Here, I analyse and document my progress with the analysis of a world-wide whole-genome sampling of Zymoseptoria tritici. Some of the analyses are purely exploratory, while others are lined up in a clear path towards answering questions related to the history of Z. tritici and to better understanding its adaptation to various climates.

library(knitr)
library(reticulate)

#Spatial analyses packages
library(raster)
library(sp)
library(rgdal)
library(maps)

#Data wrangling and data viz
library(tidyverse)
library(RColorBrewer)
library(plotly)
library(cowplot)
library(GGally)
library(corrplot)
library(pheatmap)
library(ggstance)
library('pophelper')
library(ggbiplot)
library(igraph)
library(ggraph)

#Pop structure and pop genomic
library(SNPRelate) #PCA
library(LEA) #Clustering
library(pophelper)
library(PopGenome) #Summary statistics
library(gridExtra)

library(ggtree)
library(tidytree)


#GEA
library(lfmm)

#Statistics
library(car)
library(corrr)
library(lsmeans)
library(multcomp)


# Variables ----
world <- map_data("world")
project_dir <- "/Users/feurtey/Documents/Postdoc_Bruce/Projects/WW_project/"

# Data directories
data_dir <- paste0(project_dir, "0_Data/")
metadata_dir <- paste0(project_dir, "Metadata/")

# Analysis directories
# --------------------
VAR_dir <- paste0(project_dir, "1_Variant_calling/")
vcf_dir <- paste0(VAR_dir, "4_Joint_calling/")
PopStr_dir <- paste0(project_dir, "2_Population_structure/")
Sumstats_dir <- paste0(project_dir, "3_Sumstats_demography/")
TE_RIP_dir <- paste0(project_dir, "4_TE_RIP/")
RIP_DIR <- paste0(TE_RIP_dir, "0_RIP_estimation/")
DIM2_DIR <- paste0(TE_RIP_dir, "1_Blast_from_denovo_assemblies/")
GEA_dir <- paste0(project_dir, "5_GEA/")
fung_dir <- paste0(project_dir, "6_Fungicide_resistance/")
virulence_dir <- paste0(project_dir, "7_Virulence/")
sel_dir <- paste0(project_dir, "8_Selection/")
gene_list_dir <- paste0(sel_dir, "0_Lists_unique_copy/")

# Files (full-path variants kept below for reference)
#vcf_name=paste0(project_dir, "Ztritici_global_March2020.filtered-clean.SNP.max-m-0.8.maf-0.05.thin-1000bp")
vcf_name <- "Ztritici_global_March2020.filtered-clean.SNP.max-m-0.8.maf-0.05.thin-1000bp"
#vcf_name_nomaf=paste0(project_dir, "Ztritici_global_March2020.filtered-clean.SNP.max-m-0.8.thin-1000bp")
vcf_name_nomaf <- "Ztritici_global_March2020.filtered-clean.SNP.max-m-0.8.thin-1000bp"
Zt_list <- paste0(metadata_dir, "Ztritici_list")
gff_file <- paste0(data_dir, "Zymoseptoria_tritici.MG2.Grandaubert2015.gff3")
ref_fasta_file <- paste0(data_dir, "Zymoseptoria_tritici.MG2.dna.toplevel.mt+.fa")

# Pangenome annotation table (Badet et al.)
gene_annotation <- read_tsv(paste0(data_dir, "Badet_GLOBAL_PANGENOME_TABLE.txt"))

# Export paths as environment variables so bash/python chunks can use them
Sys.setenv(PROJECTDIR = project_dir)
Sys.setenv(VARDIR = VAR_dir)
Sys.setenv(VCFDIR = vcf_dir)
Sys.setenv(POPSTR = PopStr_dir)
Sys.setenv(SUMST = Sumstats_dir)
Sys.setenv(GEADIR = GEA_dir)

Sys.setenv(ZTLIST = Zt_list)
Sys.setenv(GFFFILE = gff_file)
Sys.setenv(REFFILE = ref_fasta_file)
Sys.setenv(VCFNAME = vcf_name)
Sys.setenv(VCFNAME_NOMAF = vcf_name_nomaf)

# Knitr chunk defaults
#knitr::opts_chunk$set(echo = FALSE)
knitr::opts_chunk$set(message = FALSE, warning = FALSE, results = TRUE)


#py_install(packages = 'biopython')

# Prettier correlation colors
mycolorsCorrel <- colorRampPalette(c("#0f8b8d", "white", "#a8201a"))(20)

# Shared discrete palettes
greens <- c("#1B512D", "#669046", "#8CB053", "#B1CF5F", "#514F59")
blues <- c("#08386E", "#1C89C9", "#28A7C0", "#B0DFE8", "grey")
# 1 - Variant calling
# Mirror the variant-calling intermediates (subdirectories starting with "1")
# from the lab server to the matching local project directory.
rsync -avP \
  alice@130.125.25.187:/data2/alice/WW_project/1_Variant_calling/1* \
  /Users/feurtey/Documents/Postdoc_Bruce/Projects/WW_project/1_Variant_calling/

# NOTE(review): here the destination is the project root, not
# 1_Variant_calling/ as in the command above -- confirm this is intentional.
rsync -avP \
  alice@130.125.25.187:/data2/alice/WW_project/1_Variant_calling/2* \
  /Users/feurtey/Documents/Postdoc_Bruce/Projects/WW_project/


# 4 - TE and RIP
# Fetch the RIP-estimation summary tables (listed below).
rsync -avP \
  alice@130.125.25.187:/data2/alice/WW_project/4_TE_RIP/0_RIP_estimation/*txt \
  /Users/feurtey/Documents/Postdoc_Bruce/Projects/WW_project/4_TE_RIP/0_RIP_estimation/
#Composite_index.txt
#GC_percent.txt
#Nb_reads.txt
#Nb_reads_per_TE.txt

# Fetch the dim2 BLAST tables from the de novo assemblies.
rsync -avP \
  alice@130.125.25.187:/data2/alice/WW_project/4_TE_RIP/1_Blast_from_denovo_assemblies/1_Blast_dim2_deRIPped/*txt \
  /Users/feurtey/Documents/Postdoc_Bruce/Projects/WW_project/4_TE_RIP/1_Blast_from_denovo_assemblies/

Sampling


# Per-sample mean depth over the thinned SNP set (writes <prefix>.idepth)
vcftools --vcf ${VCFDIR}${VCFNAME}.recode.vcf \
--depth --out ${VCFDIR}${VCFNAME}
# Per-sample mean depth computed by vcftools above
depth = read_delim(paste0(vcf_dir, vcf_name, ".idepth"), delim = "\t")


# Main metadata sheet; keep only Z. tritici and build a combined coordinate key
Zt_meta=readxl::read_excel(paste0(metadata_dir, "Zt_global_data_set_09April2020_onlyWW.xlsx"),
                           sheet = 1, n_max = 1000) %>%
  filter(Species == "Ztritici") %>%
  unite(Coordinates, Latitude, Longitude, sep = ";", remove = F)

# Isolates planned for future sequencing (no genome yet)
Zt_meta_future=readxl::read_excel(paste0(metadata_dir, "New_sequencing.xlsx"),
                           sheet = 1, n_max = 1000)

# Drop flagged duplicates/clones and isolates with no country of origin
Zt_meta = Zt_meta %>%
  filter(is.na(Duplicate_Clone_LowQual_Flags) | Duplicate_Clone_LowQual_Flags != "DUPLICATE_remove") %>%
  filter(!is.na(Country))

# Write the keep-list of isolates with mean depth >= 5x.
# NOTE(review): the duplicate filter is re-applied here although Zt_meta was
# already filtered just above -- redundant but harmless.
Zt_meta %>%
  filter(is.na(Duplicate_Clone_LowQual_Flags) | Duplicate_Clone_LowQual_Flags != "DUPLICATE_remove") %>%
  inner_join(., depth %>% filter(MEAN_DEPTH >= 5), by = c("ID_file" = "INDV")) %>%
  dplyr::select(ID_file) %>% write_delim(paste0(Zt_list, ".txt"))


# Isolates with no recorded collection are grouped as "Other"
Zt_meta$Collection[is.na(Zt_meta$Collection)] <- "Other"


#Define stable colors
# One fixed color per continent/region so all plots use the same mapping
#myColors <- brewer.pal(7, "Dark2")
myColors <- c("#04078B", "#a10208", "#FFBA08", "#CC0044", "#5C9D06", "#129EBA","#305D1B")
names(myColors) = levels(factor(Zt_meta$Continent_Region))
Color_Continent = ggplot2::scale_colour_manual(name = "Continent_Region", values = myColors)
Fill_Continent = ggplot2::scale_fill_manual(name = "Continent_Region", values = myColors)

The sampling of Z. tritici isolated from the natural environment covers almost the entirety of the wheat-growing continents. It is, however, highly heterogeneous. Europe has the highest sampling density. Several locations are heavily sampled, such as fields in Switzerland or the US. Some of the available genomes are still under embargo (waiting for the publication of their creator). We are also thinking about sequencing more genomes. Here you can view these different datasets on a map. Please select and unselect the different values to see how the picture changes with the added datasets.

  • “Usable genomes” are the genomes we have already and which have no embargo on publication.
  • “Present and future” are the isolates sequenced already and without embargo plus the planned sequencing
  • “Present, future and JGI” are the isolates described above plus the JGI genomes
# Combine the planned sequencing with the existing genomes, tagging each
# isolate as future / usable / embargoed for the interactive map.
Zt_meta_for_shiny = bind_rows(
  Zt_meta_future %>%
    mutate(Sampling_collection = "Future sequencing"),
  Zt_meta %>%
    filter(!is.na(Latitude)) %>%
    # Usable = not flagged "Publication_embargo" (NA flag counts as usable)
    mutate(Sampling_collection = ifelse((Duplicate_Clone_LowQual_Flags != "Publication_embargo" |
                                           is.na(Duplicate_Clone_LowQual_Flags)),"Usable_genomes",
                                      "Embargoed genomes")))

# Summary table of genome counts per dataset
kable(Zt_meta_for_shiny %>% dplyr::count(Sampling_collection, name = "Number of genomes"))
Sampling_collection Number of genomes
Embargoed genomes 148
Future sequencing 224
Usable_genomes 727
# Largest per-location genome count, so point sizes stay comparable across maps
max_circle <- Zt_meta_for_shiny %>%
  dplyr::count(Latitude, Longitude, name = "Number_genomes") %>%
  dplyr::select(Number_genomes) %>%
  max()

# Each combination: a display label followed by the dataset labels it includes
combinations <- list(
  c("Usable genomes", "Usable_genomes"),
  c("Usable genomes and JGI", "Usable_genomes", "Embargoed genomes"),
  c("Present and future", "Usable_genomes", "Embargoed genomes", "Future sequencing")
)

# Per-location counts for every dataset combination, stacked into one table
temp_list <- lapply(seq_along(combinations), function(i) {
  combo <- combinations[[i]]
  Zt_meta_for_shiny %>%
    filter(Sampling_collection %in% combo) %>%
    dplyr::count(Country, Latitude, Longitude, name = "Number_genomes") %>%
    filter(Number_genomes > 0) %>%
    mutate(Which_isolates = combo[[1]])
})
temp <- do.call(rbind, temp_list)

# Per-location counts for the sequenced genomes (usable + embargoed).
# NOTE(review): this overwrites the combined `temp` built in the loop above,
# which is therefore currently unused.
temp = Zt_meta_for_shiny %>%
   filter(Sampling_collection == "Usable_genomes" | Sampling_collection == "Embargoed genomes") %>%
   dplyr::count(Country, Latitude, Longitude, name = "Number_genomes") %>%
   filter(Number_genomes > 0)

# World map with one bubble per sampling location (size = number of genomes)
isolate_map = ggplot() + theme_void() +
  geom_polygon(data = world, aes(x=long, y = lat, group = group), fill="#ede7e3", alpha=0.7) +
  geom_point (data = temp, aes(x=as.numeric(Longitude), y=as.numeric(Latitude),
                                    size=Number_genomes,
                                    text = Country),
                                  alpha = 0.6, color = "#16697a") +
  scale_size("Number of genomes", limits = c(1, max_circle))

#ggplotly(isolate_map)
#isolate_map
# Planned (future) sequencing locations, overlaid in yellow on the same map
temp2 = Zt_meta_for_shiny %>%
   filter(Sampling_collection == "Future sequencing") %>%
   dplyr::count(Country, Latitude, Longitude, name = "Number_genomes") %>%
   filter(Number_genomes > 0)
isolate_map +
  geom_point (data = temp2, aes(x=as.numeric(Longitude), y=as.numeric(Latitude),
                                    size=Number_genomes,
                                    text = Country),
                                  alpha = 0.6, color = "#FFBA08") +
  scale_size("Number of genomes", limits = c(1, max_circle))

The sampling also covers a wide range of years, from 1990 to 2017. Just as with the geographical distribution, some years are heavily sampled, reflecting sampling in specific fields done for previous experiments.

# Complete range of sampling years, so unsampled years still appear on the axis
temp <- tibble(
    value = seq(min(Zt_meta_for_shiny$Date_Year, na.rm = TRUE),
                max(Zt_meta_for_shiny$Date_Year, na.rm = TRUE))
  ) %>%
  mutate(`Sampling year` = as.character(value))

# Genome counts per year and continent, padded with zeros for missing years
sum_temp <- Zt_meta_for_shiny %>%
  mutate(`Sampling year` = as.character(Date_Year)) %>%
  dplyr::count(`Sampling year`, Continent_Region) %>%
  full_join(temp, by = "Sampling year") %>%
  mutate(`Genome number` = replace_na(n, 0))

# Stacked bar chart of genomes per sampling year, colored by continent
ggplot(sum_temp,
       aes(x = `Sampling year`, y = `Genome number`, fill = Continent_Region)) +
  geom_col() +
  theme_bw() +
  theme(axis.title = element_blank(),
        axis.text.x = element_text(angle = 60, hjust = 1)) +
  Fill_Continent



Variant calling


Whole chromosomes SNV

Question: How prevalent is aneuploidy in natural populations of Z. tritici? In the case of accessory chromosomes, is there a correlation between phylogeny, environment, host or time and the presence/absence of some chromosomes?

Methods: Based on the depth of coverage for all samples, we can identify, for both core and accessory chromosomes, whether each isolate carries 0, 1 or several copies. Caveat: I cheated and used the depth from the vcf file with the SNP subset to get a preliminary analysis. The proper analysis should be done on the bam files.

#NB: this table is currently not used. See TODO in next chunk.
# Run vcftools --depth per chromosome, and record the number of retained sites
# per chromosome (parsed from the vcftools log) in Nb_sites.txt.
nb_sites_table=${VCFDIR}Nb_sites.txt
echo "CHROM NB_SITES" > ${nb_sites_table}
for i in {1..21};
do
  # NOTE(review): the leading "/" makes a double slash since VCFDIR is already
  # an absolute path -- harmless on POSIX, but worth cleaning up.
  vcftools --vcf ${VCFDIR}${VCFNAME}.recode.vcf \
   --out ${VCFDIR}${VCFNAME}.chr_${i} \
   --chr ${i} \
   --depth &> \
   /${VCFDIR}${VCFNAME}.chr_${i}.log ;
  # Extract the "After filtering ... Sites" count from the log
  nb_sites=$(grep "After filtering" ${VCFDIR}${VCFNAME}.chr_${i}.log \
   | grep "Sites" | cut -f 4 -d " ")
  echo chr_${i} ${nb_sites} >> $nb_sites_table
done

# Clean up the per-chromosome logs once parsed
rm ${VCFDIR}${VCFNAME}.chr_*.log
depth_prefix <- paste0(vcf_dir, vcf_name, ".chr_")
depth_suffix <- ".idepth"

# Read the per-chromosome vcftools --depth output (one .idepth file per
# chromosome, produced by the bash chunk above) and stack them.
# Fix: dropped the unused `coverage_results` list and the manual counter `d`
# (it always equalled the loop index).
depth_results <- vector("list", 21)
for (i in seq_len(21)) {
  temp <- read.table(paste0(depth_prefix, i, depth_suffix), header = TRUE, na.strings = "NaN")
  temp$CHROM <- i
  temp$NORM_N_SITES <- temp$N_SITES / max(temp$N_SITES) #TODO shift to using table created above
  depth_results[[i]] <- temp
}

depth <- bind_rows(depth_results)
# Samples with no data on a chromosome get a depth of 0 (absent chromosome)
depth$MEAN_DEPTH[is.na(depth$MEAN_DEPTH)] <- 0

# Per-isolate baseline: median depth over the CORE chromosomes (1-13)
summary_depth <- depth[depth$CHROM < 14, ] %>%
  dplyr::group_by(INDV) %>%
  dplyr::summarise(core_genome_depth = median(MEAN_DEPTH, na.rm = TRUE), coverage = sum(N_SITES))

# Attach the core-genome baseline (columns INDV, core_genome_depth) to all rows
depth <- full_join(depth, summary_depth[, c(1, 2)])

# Normalize each chromosome's depth by the isolate's core baseline;
# cap at 2 for display so extreme values do not dominate the color scale.
depth$NORM_MEAN_DEPTH <- depth$MEAN_DEPTH / depth$core_genome_depth
depth$NORM_MEAN_DEPTH_CAPPED <- ifelse(depth$NORM_MEAN_DEPTH < 2, depth$NORM_MEAN_DEPTH, 2)

# Add metadata and order samples by continent, then country, then year
depth <- inner_join(Zt_meta, depth, by = c("ID_file" = "INDV")) %>%
  mutate(Sample = fct_reorder(ID_file, Date_Year)) %>%
  mutate(Sample = fct_reorder(Sample, Country)) %>%
  mutate(Sample = fct_reorder(Sample, Continent_Region))

In the heatmap, I represent the depth normalized by the median depth over all core chromosomes. As expected, the copy-number variation at the chromosome scale affects mostly the accessory chromosomes (AC). There is some presence of supernumerary AC and a lot of presence-absence variation. Supernumerary chromosomes can also be found in the core chromosomes but this is almost anecdotal as over the whole sampling this was found only in 9 cases.

# Heatmap of per-chromosome depth normalized by each isolate's core-genome
# median; the dashed line separates core (1-13) from accessory (14-21)
# chromosomes.
heatmap_depth <- ggplot(data = depth, aes(x = CHROM, y = Sample, fill = NORM_MEAN_DEPTH)) +
  geom_tile() +
  scale_fill_gradient(low = "white", high = "#16697a") +
  geom_vline(xintercept = 13.5, linetype = "longdash", colour = "gray20") +
  theme(axis.text.y = element_blank(),
        axis.title.y = element_blank(), axis.ticks.y = element_blank()) +
  labs(fill = "Depth", x = "Chromosome") +
  xlim(c(0.5, 21.5))
# Fix: `labs(x = "Chromosome")` was a free-standing statement (missing `+`), so
# the x-axis label was never added to the plot and a labels object was printed
# instead; the stale console output documenting this has been removed.
# Narrow side strip showing each isolate's continent, aligned with the heatmap
plot_continent = ggplot(data = depth, aes(x = 1, y=Sample, fill=Continent_Region)) +
  geom_tile(aes(width = 2))  +
  theme_classic() +
  theme(axis.text.y = element_blank(), axis.ticks.y=element_blank(),
        legend.position="left",
        axis.text.x=element_text(colour="white")) +
  # NOTE(review): Fill_Continent below replaces this brewer scale (ggplot keeps
  # only the last fill scale), so scale_fill_brewer has no effect here.
  scale_fill_brewer(palette = "Dark2") +
    labs(y= "Isolate") + Fill_Continent

# Continent strip on the left, depth heatmap on the right
plot_grid(plot_continent, heatmap_depth, rel_widths = c(2, 5))

# Normalized-depth thresholds used to call chromosome loss ("Low") and
# gain ("High"); everything in between is "Normal".
Lthres <- 0.50
Hthres <- 1.50

depth <- depth %>%
  dplyr::mutate(
    Depth_is = ifelse(NORM_MEAN_DEPTH < Lthres, "Low",
                      ifelse(NORM_MEAN_DEPTH > Hthres, "High", "Normal"))
  )

# Counts of High/Low/Normal calls per chromosome
bar_Ndepth_per_CHR <- ggplot(depth, aes(x = CHROM, fill = Depth_is)) +
  geom_bar() +
  scale_fill_manual(values = c("#16697a", "#82c0cc", "#EDE7E3")) +
  theme_light() +
  labs(x = "Chromosome", y = "Number of isolates")

# Lollipop plots
## High normalized depth: isolates carrying supernumerary chromosome copies
temp = depth %>%
  filter(Depth_is == "High") %>%
  dplyr::group_by(CHROM) %>%
  dplyr::count()

lolhigh <- ggplot(temp, aes(x = as.character(CHROM), y = n)) +
  # light full-length background stick, then a dark stick up to the count
  geom_segment(aes(x = as.character(CHROM), xend = as.character(CHROM),
                   y = 0, yend = max(temp$n)),
               color = "grey80", size = 1) +
  geom_segment(aes(x = as.character(CHROM), xend = as.character(CHROM),
                   y = 0, yend = n),
               color = "grey20", size = 1) +
  geom_point(color = "#16697a", size = 4) +
  geom_text(aes(label = n, y = n), hjust = -0.5, vjust = -0.2) +
  theme_light() +
  theme(panel.grid.major.y = element_blank(),
        panel.border = element_blank(),
        axis.ticks.x = element_blank(),
        axis.text.x = element_text(size = 10),
        axis.text.y = element_text(size = 10)) +
  # headroom above the largest count so the labels are not clipped
  scale_y_continuous(limits = c(0, max(temp$n) + 2 + max(temp$n) * 0.1)) +
  labs(x = "Chromosome", y = "Number of isolates with supernumerary chromosome") +
  coord_flip()

## Low normalized depth: isolates missing a chromosome entirely
temp = depth %>%
  filter(Depth_is == "Low") %>%
  dplyr::group_by(CHROM) %>%
  dplyr::count()

lollow <- ggplot(temp, aes(x = as.character(CHROM), y = n)) +
  # light full-length background stick, then a dark stick up to the count
  geom_segment(aes(x = as.character(CHROM), xend = as.character(CHROM),
                   y = 0, yend = max(temp$n)),
               color = "grey80", size = 1) +
  geom_segment(aes(x = as.character(CHROM), xend = as.character(CHROM),
                   y = 0, yend = n),
               color = "grey20", size = 1) +
  geom_point(color = "#82c0cc", size = 4) +
  geom_text(aes(label = n, y = n), hjust = -0.5, vjust = -0.2) +
  theme_light() +
  theme(panel.grid.major.y = element_blank(),
        panel.border = element_blank(),
        axis.ticks.x = element_blank(),
        axis.text.x = element_text(size = 10),
        axis.text.y = element_text(size = 10)) +
  scale_y_continuous(limits = c(0, max(temp$n) + max(temp$n) * 0.1)) +
  labs(x = "Chromosome", y = "Number of isolates without chromosome") +
  coord_flip()

# Assemble the figure: stacked bar on top (A), the two lollipops below (B, C)
bottom_row <- cowplot::plot_grid(lolhigh, lollow, labels = c('B', 'C'), label_size = 12)

plot_grid(bar_Ndepth_per_CHR, bottom_row, labels = c('A', ''), label_size = 12, ncol = 1)

Here is a table including the isolates with supernumerary core chromosomes.

# Isolates carrying supernumerary CORE chromosomes.
# Fix: core chromosomes are 1-13 (the depth baseline above uses CHROM < 14),
# but this filter used CHROM < 13 and silently dropped chromosome 13.
# NOTE: the printed table below predates this fix.
depth  %>%
  dplyr::filter(Depth_is == "High") %>%
  dplyr::filter(CHROM < 14)
## # A tibble: 17 x 40
##    ID_file Isolate_name1 Isolate_name2 Duplicate_Clone… Species Continent_Region
##    <chr>   <chr>         <chr>         <chr>            <chr>   <chr>           
##  1 STnnJG… Gp0153947     IPO-92044     Publication_emb… Ztriti… Africa          
##  2 STnnJG… Gp0153947     IPO-92044     Publication_emb… Ztriti… Africa          
##  3 STnnJG… Gp0153950     IPO-98114     Publication_emb… Ztriti… Europe          
##  4 ST_SRR… IPO323        IPO323        Publication_emb… Ztriti… Europe          
##  5 ST_SRR… IPO323        IPO323        Publication_emb… Ztriti… Europe          
##  6 ST_SRR… IPO323        IPO323        Publication_emb… Ztriti… Europe          
##  7 ST_SRR… IPO323        IPO323        Publication_emb… Ztriti… Europe          
##  8 STnnJG… Gp0153927     IPO-10013     Publication_emb… Ztriti… Europe          
##  9 STnnJG… Gp0153928     IPO-10014     Publication_emb… Ztriti… Europe          
## 10 STnnJG… Gp0128316     IPO-95001     Publication_emb… Ztriti… Europe          
## 11 STnnJG… Gp0128313     IPO-94243     Publication_emb… Ztriti… North America   
## 12 ST16DK… DK15          INRA16-FS0671 <NA>             Ztriti… Europe          
## 13 ST16FR… FR1           INRA16-FS0794 <NA>             Ztriti… Europe          
## 14 ST16FR… FR1           INRA16-FS0794 <NA>             Ztriti… Europe          
## 15 ST13SP… INRA13-LG1979 SeptoDur104   <NA>             Ztriti… Europe          
## 16 ST90OR… a12_3B_11     <NA>          <NA>             Ztriti… North America   
## 17 ST00AR… ARG BD0069    <NA>          <NA>             Ztriti… South America   
## # … with 34 more variables: Cultivar <chr>, Bread_Durum_Wheat <chr>,
## #   Collection <chr>, Plot <dbl>, Collector_Publication <chr>, Country <chr>,
## #   Region_1 <chr>, Region_2 <chr>, City <chr>, Latitude <dbl>,
## #   Coordinates <chr>, Longitude <dbl>, Date_Year <dbl>,
## #   `Publication/comments` <lgl>, PacBio_Genome <lgl>, Library_strategy <chr>,
## #   Phred <chr>, R1_1 <chr>, R1_2 <chr>, R2_1 <chr>, R2_2 <chr>, R1cat <chr>,
## #   R2cat <chr>, bam <chr>, gvcf <chr>, N_SITES <int>, MEAN_DEPTH <dbl>,
## #   CHROM <int>, NORM_N_SITES <dbl>, core_genome_depth <dbl>,
## #   NORM_MEAN_DEPTH <dbl>, NORM_MEAN_DEPTH_CAPPED <dbl>, Sample <fct>,
## #   Depth_is <chr>

And an overlook of the accessory chromosomes PAV per continent (only considering continents with more than 10 isolates).

# Accessory chromosomes (14-21): proportion of High/Low/Normal calls per
# continent, keeping only continent x chromosome groups with >= 10 isolates.
depth %>%
  dplyr::filter(CHROM > 13) %>%
  dplyr::group_by(Continent_Region, CHROM) %>%
  dplyr::mutate(Count_sample_per_continent = n()) %>%
  dplyr::filter(Count_sample_per_continent >= 10) %>%
  ggplot(aes(x = Continent_Region, fill = Depth_is)) +
  geom_bar(position = "fill" ) + facet_wrap(CHROM~.) +
  theme_light() +
  scale_fill_manual(values = c("#16697a", "#82c0cc", "#EDE7E3")) +
    theme(axis.text.x = element_text(angle = 40, hjust = 1)) +
  ylab("Proportion of chromosomes") + xlab("")

Genes PAV

Question: Could genes PAV be related to phylogeny, environment, host or time? What about predicted effector genes?

Methods: Based on depth of coverage + SGSGeneloss. For this section, I expanded the gene selection to all genes in the pangenome. This unfortunately means remapping everything, but it captures more of the overall diversity and might be the only way to catch some population-specific variants.

See PAV from pangenome file (had to be separated because it was way too long to run)

# Per-gene depth for every sample, joined with the per-isolate metadata
depth_per_gene = read_tsv(paste0(VAR_dir,"2_Depth_per_gene/All_samples.tab")) %>%
  left_join(.,readxl::read_excel(paste0(metadata_dir, "Zt_global_data_set_09April2020_DC_AFperso.xlsx"),
                           sheet = 1, n_max = 1000))
# Median normalized depth per PacBio reference assembly.
# Assumes pangenome locus names contain "[" and encode the source assembly
# before the first "." -- TODO confirm against the pangenome table format.
depth_per_PacBio = depth_per_gene %>%
  filter(grepl("\\[", Locus_name)) %>%
  separate(col = Locus_name,
           sep = "\\.",
           into= c("PacBio", "rest"),
           remove = F,
           fill = "left") %>%
  group_by(ID_file, PacBio, Continent_Region) %>%
  dplyr::summarise(Median_depth = median(as.numeric(Depth), na.rm = T),
                   Nb_windows = n())  %>%
  inner_join(., summary_depth, by=c("ID_file" = "INDV")) %>%
  mutate(Normalized_depth = Median_depth / core_genome_depth)


# Wide matrix: one row per isolate, one column per PacBio assembly
# (columns 1, 2, 8 of depth_per_PacBio = ID_file, PacBio, Normalized_depth)
for_pheatmap = spread(depth_per_PacBio[, c(1, 2, 8)], "PacBio", "Normalized_depth")
for_pheatmap = Zt_meta %>% dplyr::select(ID_file, Continent_Region) %>%
  inner_join(., for_pheatmap) %>%
  filter(complete.cases(.))

# pheatmap needs a plain data.frame with isolate IDs as rownames
temp = as.data.frame(for_pheatmap)
rownames(temp) = temp[,1]
temp = temp[,-1]
my_palette <- colorRampPalette(c("white", "black"))(n = 299)

# Row annotation: continent of each isolate
annot_continent = as.data.frame(temp[, 1])
row.names(annot_continent) <- row.names(temp)
colnames(annot_continent) <- "Continent"


# Column annotation: continent of origin of each PacBio reference assembly
# (order must match the rownames assigned just below)
color_PacBio = data.frame(Continent = c("Europe", "Europe",  "Europe", "Europe", "South America",
                           "Oceania", "South America", "North America", "Europe",
                           "North America", "Middle East", "Middle East", "Middle East", "Africa",
                           "North America", "Africa", "Europe", "Middle East"))

rownames(color_PacBio) <- c("1A5", "1E4",  "3D1", "3D7", "Arg00_1D1a1",
                           "Aus01_1H8", "CH95_3B2a", "CNR93_D3a1", "CRI10_0616",
                           "I93_11a_2", "IR01_26b", "IR01_48b", "ISY92_Ar4f", "KE94_MF1a",
                           "OregS90_a15_4A_10", "TN09_26_2-0", "UR95_E2C_2", "YEQ92_4")

# Reuse the stable per-continent palette defined earlier
my_colour = list(Continent = myColors)



# Clustered heatmap of normalized depth per isolate (rows) x assembly (columns);
# first column of temp is Continent_Region, hence the 2:ncol() slice.
pheatmap(as.matrix(temp[,c(2:ncol(temp))]),
         color=my_palette,
         cluster_cols = TRUE,
         cluster_rows=TRUE,
         border_color = NA,
         show_rownames = F,
         annotation_row = annot_continent,
         annotation_col = color_PacBio,
         annotation_colors = my_colour)

Let’s attempt to create a PCA based on the normalized depth per gene.

library(FactoMineR)
# PCA on per-gene normalized depth: one row per isolate, one column per gene.
depth_genes_PCA <- depth_per_gene %>%
  dplyr::select(ID_file, Gene_ID, Normalized_depth) %>%
  mutate(Normalized_depth = as.numeric(Normalized_depth)) %>%
  pivot_wider(names_from = Gene_ID, values_from = Normalized_depth)

# Keep complete rows only; drop the ID column (column 1) for the PCA input
temp <- as.data.frame(depth_genes_PCA)[complete.cases(depth_genes_PCA), 2:ncol(depth_genes_PCA)]
# Random subset of 1000 genes to keep the PCA tractable.
# NOTE(review): no set.seed(), so the subset (and the PCA) changes between runs.
temp2 <- temp[, sample(ncol(temp), size = 1000, replace = FALSE)]
temp2 <- sapply(temp2, as.numeric)

res.pca <- PCA(temp2, scale.unit = TRUE, ncp = 10, graph = F)

# Attach isolate IDs and metadata to the PC coordinates.
# Fix: as.tibble() is deprecated; use as_tibble().
results <- as_tibble(res.pca$ind$coord) %>%
   mutate(ID_file = as.data.frame(depth_genes_PCA)[complete.cases(depth_genes_PCA), 1]) %>%
  left_join(.,readxl::read_excel(paste0(metadata_dir, "Zt_global_data_set_09April2020_DC_AFperso.xlsx"),
                           sheet = 1, n_max = 1000))

# PC1 vs PC2 colored by continent (axis labels show % variance explained)
p <- ggplot(results, aes(x = Dim.1, y = Dim.2, text = ID_file, color = Continent_Region)) +
  geom_point(aes(color = Continent_Region)) +
  labs(x = paste0("PC 1 (", round(res.pca$eig[1,2]), "%)"),
       y = paste0("PC 2 (", round(res.pca$eig[2,2]), "%)")) +
  Color_Continent +
  theme_bw()
ggplotly(p)
# Same PCs colored by collection, to spot dataset/batch effects
p <- ggplot(results, aes(x = Dim.1, y = Dim.2, text = ID_file, color = Collection)) +
  geom_point() +
  labs(x = paste0("PC 1 (", round(res.pca$eig[1,2]), "%)"),
       y = paste0("PC 2 (", round(res.pca$eig[2,2]), "%)")) +
  theme_bw()
ggplotly(p)

Different datasets have very different depths of coverage. It is possible that this explains the clustering observed above. What if I simplified the matrix to include only rounded numbers, which would be an approximation of CNV?

library(FactoMineR)
# Same PCA as above, but on rounded copy-number estimates, restricted to
# isolates with reasonable core coverage (10-50x) to limit depth artefacts.
depth_genes_PCA <- depth_per_gene %>%
  filter(Median_core_depth >= 10 & Median_core_depth <= 50) %>%
  dplyr::select(ID_file, Gene_ID, Rounded_norm)  %>%
  pivot_wider(names_from = Gene_ID, values_from = Rounded_norm)


# Keep complete rows only; drop the ID column for the PCA input
temp <- as.data.frame(depth_genes_PCA)[complete.cases(depth_genes_PCA), 2:ncol(depth_genes_PCA)]
# NOTE(review): random 1000-gene subset without set.seed() -- not reproducible
temp2 <- temp[, sample(ncol(temp), size = 1000, replace = FALSE)]
temp2 <- sapply(temp2, as.numeric)

res.pca <- PCA(temp2, scale.unit = TRUE, ncp = 10, graph = F)

# Fix: as.tibble() is deprecated; use as_tibble().
results <- as_tibble(res.pca$ind$coord) %>%
   mutate(ID_file = as.data.frame(depth_genes_PCA)[complete.cases(depth_genes_PCA), 1]) %>%
  left_join(.,readxl::read_excel(paste0(metadata_dir, "Zt_global_data_set_09April2020_DC_AFperso.xlsx"),
                           sheet = 1, n_max = 1000))

# PC1 vs PC2 colored by continent
p <- ggplot(results, aes(x = Dim.1, y = Dim.2, text = ID_file, color = Continent_Region)) +
  geom_point(aes(color = Continent_Region)) +
  labs(x = paste0("PC 1 (", round(res.pca$eig[1,2]), "%)"),
       y = paste0("PC 2 (", round(res.pca$eig[2,2]), "%)")) +
  Color_Continent +
  theme_bw()
ggplotly(p)
# PC2 vs PC3 colored by continent.
# Fix: the axis labels previously reused eig rows 1 and 2 (PC1/PC2 variance)
# for this PC2 vs PC3 panel; they now use the matching eigenvalue rows.
p <- ggplot(results, aes(x = Dim.2, y = Dim.3, text = ID_file, color = Continent_Region)) +
  geom_point(aes(color = Continent_Region)) +
  labs(x = paste0("PC 2 (", round(res.pca$eig[2,2]), "%)"),
       y = paste0("PC 3 (", round(res.pca$eig[3,2]), "%)")) +
  Color_Continent +
  theme_bw()
ggplotly(p)
# PC1 vs PC2 colored by collection, to spot dataset/batch effects
p <- ggplot(results, aes(x = Dim.1, y = Dim.2, text = ID_file, color = Collection)) +
  geom_point() +
  labs(x = paste0("PC 1 (", round(res.pca$eig[1,2]), "%)"),
       y = paste0("PC 2 (", round(res.pca$eig[2,2]), "%)")) +
  theme_bw()
ggplotly(p)

Hum… Something looks really wrong! Not sure why. Let’s look at it again at some point.

Genes CNV

# Per-isolate number of genes with an estimated copy number above one,
# restricted to isolates with reasonable core coverage (10-50x).
depth_per_gene %>%
  filter(Median_core_depth >= 10,
         Median_core_depth <= 50,
         Rounded_norm > 1,
         !is.na(Continent_Region)) %>%
  dplyr::count(ID_file, Continent_Region, name = "Nb_duplicated_genes") %>%
  #filter(Nb_duplicated_genes < 400) %>%
  ggplot(aes(x = Continent_Region,
             y = Nb_duplicated_genes,
             fill = Continent_Region)) +
  geom_violin() +
  geom_boxplot(color = "white", width = 0.08, outlier.shape = NA) +
  Fill_Continent +
  theme_bw() +
  theme(legend.position = "none",
        axis.text.x = element_text(size = 10, angle = 20, hjust = 1)) +
  labs(x = "", y = "Number  of genes",
       title = str_wrap(paste0("How many genes have a copy number ",
                               "higher than one?"),  width = 60),
       subtitle = "The estimation of the copy number is based on depth of coverage.")

# Per-isolate number of genes with an estimated copy number of exactly two
# (Rounded_norm > 1 and <= 2), for well-covered isolates only.
duplicated_genes = depth_per_gene %>%
  filter(Median_core_depth >= 10 & Median_core_depth <= 50) %>%
  filter(Rounded_norm > 1) %>%
  filter(Rounded_norm <= 2) %>%
  dplyr::filter(!is.na(Continent_Region)) %>%
  group_by(ID_file, Continent_Region) %>%
  summarize(Nb_duplicated_genes = n())
  #filter(Nb_duplicated_genes < 400) %>%

# Distribution of duplicated-gene counts per continent
ggplot(duplicated_genes, aes(x = Continent_Region,
             y = Nb_duplicated_genes,
             fill = Continent_Region)) +
    geom_violin() +
    geom_boxplot(color = "white", width=0.08, outlier.shape = NA) +
    Fill_Continent +
    theme_bw() +
  theme(legend.position = "none",
        axis.text.x = element_text(size = 10, angle = 20, hjust = 1)) +
  labs( x = "", y = "Number of genes",
        title = str_wrap(paste0("How many genes have a copy number ",
                                "of two?"),  width = 60),
        subtitle = "The estimation of the copy number is based on depth of coverage.")

# Same computation as `duplicated_genes` above, repeated here just to print
# the table of counts.
depth_per_gene %>%
  filter(Median_core_depth >= 10 & Median_core_depth <= 50) %>%
  filter(Rounded_norm > 1) %>%
  filter(Rounded_norm <= 2) %>%
  filter(!is.na(Continent_Region)) %>%
  group_by(ID_file, Continent_Region) %>%
  summarize(Nb_duplicated_genes = n())
## # A tibble: 300 x 3
## # Groups:   ID_file [300]
##    ID_file     Continent_Region Nb_duplicated_genes
##    <chr>       <chr>                          <int>
##  1 ST01AUS_1A4 Oceania                           54
##  2 ST01AUS_1A5 Oceania                           36
##  3 ST01AUS_1A6 Oceania                           71
##  4 ST01AUS_1A9 Oceania                           45
##  5 ST01AUS_1B1 Oceania                           80
##  6 ST01AUS_1B2 Oceania                           69
##  7 ST01AUS_1B7 Oceania                           77
##  8 ST01AUS_1B8 Oceania                           66
##  9 ST01AUS_1C1 Oceania                           59
## 10 ST01AUS_1C2 Oceania                           50
## # … with 290 more rows
# Export the rounded copy-number matrix (rows = genes, columns = samples)
# for use on the cluster; same coverage filter as the analyses above.
depth_per_gene %>%
  filter(Median_core_depth >= 10 & Median_core_depth <= 50) %>%
  dplyr::select(ID_file, Gene_ID, Rounded_norm)  %>%
  pivot_wider(names_from = ID_file, values_from = Rounded_norm) %>%
  write_tsv(paste0(VAR_dir,"2_Depth_per_gene/Rounded_depth_per_gene_per_sample.tab"))

Just creating a file to use on the cluster


# For each gene (one row of the wide copy-number table), write the gene name
# followed by the samples that carry exactly one copy (CNV == "1").
# Paths come from R via reticulate (the `r.` object).
# Fix: use context managers so both files are closed even if an error occurs,
# and drop the pointless `p = out.write(...)` assignment.
with open(r.VAR_dir + "2_Depth_per_gene/Rounded_depth_per_gene_per_sample.tab") as depth_input, \
     open(r.sel_dir + "Only_unique_copies.tab", "w") as out:
    header = None
    for i, line in enumerate(depth_input):
        fields = line.strip().split("\t")
        if i == 0:
            # First row holds the sample IDs (first column is the gene name)
            header = fields
        else:
            gene = fields[0]
            sample_list = [gene]
            for sample, CNV in zip(header[1:], fields[1:]):
                if CNV == "1":
                    sample_list.append(sample)
            out.write("\t".join(sample_list) + "\n")

print("I'm done.")
## I'm done.
# Free the large per-gene depth table; it is no longer needed below
rm(depth_per_gene)



Population structure / biogeography


Question: Is the world-wide population of Z.tritici structured? If so, is it structured according to geography, host or time (or any other relevant info we hopefully have)?

Previous genomic work has shown very clear structure between populations of Z. tritici. However, the sampling was extremely heterogeneous. With a more geographically even sampling, do we also observe a clear-cut structure?
Methods: Analyses to create would be:

  • PCA
  • Structure-like analysis,
  • Tree (rooted on Za to infer origin if pattern?) #TODO
  • Map with cluster if clusters #TODO

Principal Component Analysis

PCA with all continents

As a first method to investigate the population structure of Z. tritici at the world-wide scale, I chose to do a principal component analysis based on a subset of the SNPs. This PCA confirms previous results of a geographically structured species. Oceania emerges quite distinctly as three separate clusters: one in New Zealand and two Australian (see below for a more in-depth analysis of this pattern). North America is also quite separate. The distinction between the rest of the geographical locations is not clear in this PCA, although PC3 might show a slight differentiation between Europe and the Middle East.


# Keep only the retained isolates (ZTLIST), drop sites that are invariant in
# that subset (--non-ref-ac-any 1) and all filtered records.
vcftools --vcf ${VCFDIR}${VCFNAME}.recode.vcf \
  --keep $ZTLIST.txt \
  --non-ref-ac-any 1 \
  --remove-filtered-all --recode --recode-INFO-all \
  --out ${VCFDIR}${VCFNAME}.pass
# Convert the filtered VCF to GDS (biallelic SNPs only) and run the PCA
snpgdsVCF2GDS(paste0(vcf_dir, vcf_name, ".pass.recode.vcf"),
              paste0(PopStr_dir, vcf_name, ".pass.recode.gds"), method="biallelic.only")
genofile <- snpgdsOpen(paste0(PopStr_dir, vcf_name, ".pass.recode.gds"))
pca <-snpgdsPCA(genofile)
snpgdsClose(genofile)

# Keep the first four PCs and attach the isolate metadata.
# Fix: as.tibble() is deprecated; use as_tibble() (and tibble() for the
# eigenvalue vector below); seq_along() replaces c(1:length(...)).
pca2 <- as_tibble(pca$eigenvect) %>% dplyr::select(V1:V4)
colnames(pca2) <- c("PC1", "PC2", "PC3", "PC4")
pca2 <- pca2 %>%
  dplyr::mutate(sample_id = pca$sample.id) %>%
  dplyr::right_join(., Zt_meta, by = c("sample_id" = "ID_file")) %>%
  unite(sample_id, Country, col = "for_display", remove = F)

# Scree plot of the non-missing eigenvalues
tibble(value = pca$eigenval[!is.na(pca$eigenval)]) %>%
  ggplot(aes(x = seq_along(value),
             y = value)) + geom_point() +
  theme_bw()

eigen_sum = sum(pca$eigenval[!is.na(pca$eigenval)])
p = ggplot(pca2, aes(x = PC1, y= PC2, text = for_display)) +
  geom_point(aes(color = Continent_Region)) +
  labs(x = paste0("PC 1 (", round(pca$eigenval[1]*100/eigen_sum, 2), "%)"),
       y = paste0("PC 2 (", round(pca$eigenval[2]*100/eigen_sum, 2), "%)")) +
  Color_Continent +
  theme_bw()

#p
ggplotly(p)
# Same display for the third and fourth components.
xlab_pc3 <- paste0("PC 3 (", round(pca$eigenval[3] * 100 / eigen_sum, 2), "%)")
ylab_pc4 <- paste0("PC 4 (", round(pca$eigenval[4] * 100 / eigen_sum, 2), "%)")
q = ggplot(pca2, aes(x = PC3, y = PC4, text = for_display)) +
  geom_point(aes(color = Continent_Region)) +
  labs(x = xlab_pc3, y = ylab_pc4) +
  Color_Continent +
  theme_bw()

#q
ggplotly(q)
# Pairs plot of the first four PCs, colored by continent/region.
p = ggpairs(pca2, columns = c(1:4),
            ggplot2::aes(col=Continent_Region, fill = Continent_Region, alpha = 0.6),
            title = "PCA based on thinned SNPs",
            upper = list(continuous = "points", combo = "box_no_facet"))

# Re-theme every panel; seq_len() is safe even for an empty grid, whereas
# 1:p$nrow would yield c(1, 0) if nrow were ever 0.
for(i in seq_len(p$nrow)) {
  for(j in seq_len(p$ncol)){
    p[i,j] <- p[i,j] + theme_bw() + Color_Continent + Fill_Continent
  }
}

p

PCA without Oceania

In order to investigate more precisely the population structure in Europe, the Middle-East and America, I removed the Oceanian isolates and recreated the PCA. This highlights the difference between the North American samples and the rest of the samples, in Europe, South America and the Middle-East. These last locations do not separate as distinct clusters but as a gradient, visible on PC2 (from Europe to the Middle-East) and PC4.

# Isolate list without Oceania: drop flagged duplicates, keep only samples with
# mean depth >= 5, and write the IDs for vcftools --keep.
# (A Publication_embargo filter was tried here previously and left disabled.)
good_depth <- depth %>% filter(MEAN_DEPTH >= 5)
Zt_meta %>%
  filter(is.na(Duplicate_Clone_LowQual_Flags) |
           Duplicate_Clone_LowQual_Flags != "DUPLICATE_remove") %>%
  inner_join(., good_depth) %>%
  filter(Continent_Region != "Oceania") %>%
  dplyr::select(ID_file) %>%
  write_delim(paste0(PopStr_dir, "Zt_list_wo_oceania.txt"))
# Same vcftools subsetting as above, but using the Oceania-free isolate list.
vcftools --vcf ${VCFDIR}${VCFNAME}.recode.vcf \
  --keep ${POPSTR}Zt_list_wo_oceania.txt \
  --remove-filtered-all --recode --recode-INFO-all \
  --non-ref-ac-any 1 \
  --out ${POPSTR}${VCFNAME}.pass.wo_oceania
# VCF -> GDS conversion and PCA for the Oceania-free subset.
gds_wo_oceania <- paste0(PopStr_dir, vcf_name, ".pass.wo_oceania.recode.gds")
snpgdsVCF2GDS(paste0(PopStr_dir, vcf_name, ".pass.wo_oceania.recode.vcf"),
              gds_wo_oceania,
              method = "biallelic.only")
genofile <- snpgdsOpen(gds_wo_oceania)
pca <- snpgdsPCA(genofile)
snpgdsClose(genofile)

# First four PCs plus metadata for the Oceania-free PCA.
# as_tibble(as.data.frame(...)) replaces the deprecated as.tibble() and keeps
# the V1..Vn column names for the unnamed eigenvector matrix.
# NOTE(review): right_join keeps all Zt_meta rows, so Oceanian samples appear
# here with NA coordinates — confirm this is intended (the Oceania-only PCA
# below uses inner_join instead).
pca2 = as_tibble(as.data.frame(pca$eigenvect)) %>% dplyr::select(V1:V4) %>%
  mutate(sample_id = pca$sample.id ) %>%
  right_join(., Zt_meta, by = c("sample_id" = "ID_file")) %>%
  unite(sample_id, Country, col = "for_display", remove = F)

# Percent variance for the axis labels (the eigenvector columns keep their
# default V1..V4 names here).
eigen_sum = sum(pca$eigenval[!is.na(pca$eigenval)])
pc_pct <- round(pca$eigenval[1:4] * 100 / eigen_sum, 2)

# PC1 vs PC2.
p = ggplot(pca2, aes(x = V1, y = V2, text = for_display)) +
  geom_point(aes(color = Continent_Region)) +
  labs(x = paste0("PC 1 (", pc_pct[1], "%)"),
       y = paste0("PC 2 (", pc_pct[2], "%)")) +
  Color_Continent +
  theme_bw()

#p
ggplotly(p)

# PC3 vs PC4.
q = ggplot(pca2, aes(x = V3, y = V4, text = for_display)) +
  geom_point(aes(color = Continent_Region)) +
  labs(x = paste0("PC 3 (", pc_pct[3], "%)"),
       y = paste0("PC 4 (", pc_pct[4], "%)")) +
  Color_Continent +
  theme_bw()

#q
ggplotly(q)

PCA: Oceania only

This subsection was created to satisfy my own curiosity, but I guess this is what Megan McDonald is doing at the moment? I might remember wrongly from her presentation, but there was something about a resistance to a new fungicide differing between the two Australian collections.

# Oceania-only isolate list: flagged duplicates and embargoed samples removed,
# mean depth >= 5 required; IDs written for vcftools --keep.
oceania_ids <- Zt_meta %>%
  filter(is.na(Duplicate_Clone_LowQual_Flags) |
           Duplicate_Clone_LowQual_Flags != "DUPLICATE_remove") %>%
  filter(is.na(Duplicate_Clone_LowQual_Flags) |
           Duplicate_Clone_LowQual_Flags != "Publication_embargo") %>%
  inner_join(., depth %>% filter(MEAN_DEPTH >= 5)) %>%
  filter(Continent_Region == "Oceania") %>%
  dplyr::select(ID_file)
write_delim(oceania_ids, paste0(PopStr_dir, "Zt_list_only_oceania.txt"))

# Subset the VCF to the Oceania-only isolate list.
vcftools --vcf ${VCFDIR}$VCFNAME.recode.vcf \
  --keep ${POPSTR}Zt_list_only_oceania.txt \
  --remove-filtered-all --recode --recode-INFO-all \
  --non-ref-ac-any 1 \
  --out ${POPSTR}$VCFNAME.pass.only_oceania
# VCF -> GDS conversion and PCA for the Oceania-only subset.
gds_only_oceania <- paste0(PopStr_dir, vcf_name, ".pass.only_oceania.recode.gds")
snpgdsVCF2GDS(paste0(PopStr_dir, vcf_name, ".pass.only_oceania.recode.vcf"),
              gds_only_oceania,
              method = "biallelic.only")
genofile <- snpgdsOpen(gds_only_oceania)
pca <- snpgdsPCA(genofile)
snpgdsClose(genofile)

# First four PCs plus metadata; inner_join keeps Oceanian samples only.
# as_tibble(as.data.frame(...)) replaces the deprecated as.tibble() and keeps
# the V1..Vn column names for the unnamed eigenvector matrix.
pca2 = as_tibble(as.data.frame(pca$eigenvect)) %>% dplyr::select(V1:V4) %>%
  mutate(sample_id = pca$sample.id ) %>%
  inner_join(., Zt_meta, by = c("sample_id" = "ID_file")) %>%
  unite(sample_id, Country, col = "for_display", remove = F)

# Axis labels show % variance per PC; points are colored by sampling year and
# shaped by country to tell the two Australian collections apart.
eigen_sum = sum(pca$eigenval[!is.na(pca$eigenval)])
pct_axis <- round(pca$eigenval[1:4] * 100 / eigen_sum, 2)

p = ggplot(pca2, aes(x = V1, y = V2, text = for_display)) +
  geom_point(aes(color = as.character(Date_Year), shape = Country)) +
  labs(x = paste0("PC 1 (", pct_axis[1], "%)"),
       y = paste0("PC 2 (", pct_axis[2], "%)")) +
  scale_color_manual(values = blues) +
  theme_bw()

#p
ggplotly(p)

q = ggplot(pca2, aes(x = V3, y = V4, text = for_display)) +
  geom_point(aes(color = as.character(Date_Year), shape = Country)) +
  labs(x = paste0("PC 3 (", pct_axis[3], "%)"),
       y = paste0("PC 4 (", pct_axis[4], "%)")) +
  scale_color_manual(values = blues) +
  theme_bw()

#q
ggplotly(q)

Structure-like clustering

The clustering here is done by using the snmf method from the LEA R package (http://membres-timc.imag.fr/Olivier.Francois/LEA/) on the same subset of SNPs as the PCA, but without any missing data. I ran the analysis for a K (number of clusters inferred) ranging from 1 to 15 and with 10 repeats for each K.



# Genotype matrix for sNMF: PASS biallelic SNPs only, no missing data,
# MAF >= 0.05, exported as a vcftools GT FORMAT table.
vcftools --vcf ${VCFDIR}$VCFNAME.recode.vcf \
  --keep $ZTLIST.txt \
  --remove-filtered-all --extract-FORMAT-info GT \
  --max-missing 1.0 --min-alleles 2 --max-alleles 2 \
  --maf 0.05 \
  --out ${POPSTR}$VCFNAME.pass_noNA


# Reshape the GT FORMAT table into LEA's .geno input:
#  - FORMAT2: genotype columns only (drop CHROM/POS, columns 1-2)
#  - .pos:    the CHROM/POS columns, kept for later reference
#  - .ind:    sample names (header row, one per line)
#  - .geno:   one row per SNP, genotypes concatenated with no separator
# gsed is GNU sed (the BSD sed shipped with macOS lacks \t/\n handling here).
cat  ${POPSTR}$VCFNAME.pass_noNA.GT.FORMAT | cut -f 3- \
    > ${POPSTR}$VCFNAME.pass_noNA.GT.FORMAT2
cat  ${POPSTR}$VCFNAME.pass_noNA.GT.FORMAT | cut -f 1,2 \
    > ${POPSTR}$VCFNAME.pass_noNA.GT.FORMAT.pos
head -n1 ${POPSTR}$VCFNAME.pass_noNA.GT.FORMAT2 | gsed "s/\t/\n/g"  \
    > ${POPSTR}$VCFNAME.pass_noNA.ind
gsed "s/\t//g"  ${POPSTR}$VCFNAME.pass_noNA.GT.FORMAT2 | tail -n +2 \
    > ${POPSTR}$VCFNAME.pass_noNA.geno
# Run sNMF on the haploid genotype matrix for K = 1..15, 10 repetitions per K,
# with cross-entropy estimation enabled to help choose K later.
project = snmf(paste0(PopStr_dir, vcf_name, ".pass_noNA.geno"), K=1:15, entropy = TRUE,
                        repetitions = 10, project = "new", ploidy = 1)

First, let’s look at the cross-validation results. sNMF estimates an entropy criterion which evaluates the quality of fit of the model to the data, potentially helping to find the best number of ancestral populations.

# Reload the saved sNMF project and plot the cross-entropy criterion per K.
snmf_path <- paste0(PopStr_dir, vcf_name, ".pass_noNA.snmfProject")
project = load.snmfProject(snmf_path)

plot(project, col = "goldenrod", pch = 19, cex = 1.2)

In the case of our analysis, we do not have a very clear-cut minimum value for the cross-entropy criterion. There is however a plateau starting from K=6. I chose to represent as barplots the best run for each value of K (defined as the one with the lowest cross-entropy value). This seems to confirm the choice of a reasonable K value at 6, since no new major cluster seems to appear at higher values. It is possible again that the Oceanian samples would separate more, as in the PCA, at higher values of K.

The results from the PCA and from the clustering analysis are coherent: Oceania separates into 3 clusters (one in New Zealand, and two in Australia) and the North American isolates form a separate cluster. Higher K values also distinguish a Middle-Eastern/African cluster from the European cluster, representing the two extreme points of the gradient found between these populations in the PCA.

# Sample IDs in the order sNMF saw them (header of the GT table, one per line).
indv_snmf = read_tsv(paste0(PopStr_dir, vcf_name, ".pass_noNA.ind"), col_names = F)
names(indv_snmf) = "Sample"

# For each K, keep the run with the lowest cross-entropy and reshape its
# Q matrix to long format: one row per sample x cluster.
datalist = list()
for (i in c(2, 3, 4, 5, 6, 7, 8)){
  best = which.min(cross.entropy(project, K = i))
  temp = as.data.frame(Q(project, i, best))
  temp = cbind(indv_snmf, temp)

  # gather() is superseded by pivot_longer() but kept here so the row order of
  # the long table stays exactly as before.
  temp = temp %>%
    gather("Cluster", "Admix_coef", -"Sample") %>%
    mutate(K = i)
  # Indexing by K leaves NULLs at unused positions; bind_rows() ignores them.
  datalist[[i]] = as_tibble(temp)   # as.tibble() is deprecated
}
# Attach metadata and order the Sample factor levels for the barplots below;
# the last fct_reorder() call (by Continent_Region) takes precedence.
snmf_results_per_K = bind_rows(datalist) %>%
  inner_join(., Zt_meta, by = c("Sample" = "ID_file"))  %>%
  unite(Continent_Region, Country, col = "for_display", remove = F)  %>%
  mutate(Sample = fct_reorder(Sample, Date_Year)) %>%
  mutate(Sample = fct_reorder(Sample, Country)) %>%
  mutate(Sample = fct_reorder(Sample, Continent_Region))

# Stacked admixture barplots, one facet per K; sample names are hidden on the
# axis and exposed through the interactive hover text instead.
p = snmf_results_per_K %>%
  ggplot(aes(x = Sample, y = Admix_coef, fill = Cluster, text = for_display)) +
  geom_bar(position = "stack", stat = "identity") +
  facet_grid(K ~ .) +
  theme_bw() +
  theme(axis.title = element_blank(),
        axis.text.x = element_blank(),
        legend.title = element_blank()) #element_text(angle = 60, hjust = 1))

ggplotly(p)
# For every K from 1 to 15, locate the Q-matrix file of the run with the
# lowest cross-entropy.
K_list = c(1:15)
afiles = character(length(K_list))
for (i in K_list){
  best_run = which.min(cross.entropy(project, K = i))
  afiles[i] = Sys.glob(paste0(PopStr_dir, vcf_name,
                              ".pass_noNA.snmf/K", i, "/run", best_run, "/*Q"))
}

# create a qlist
# Read all Q matrices with pophelper and align cluster labels across K values.
qlist <- readQBasic(afiles)
al_qlist = alignK(qlist)

# Country/continent labels in the same sample order as the Q matrices.
lab_set = inner_join(indv_snmf, Zt_meta, by = c("Sample" = "ID_file")) %>%
  dplyr::select(Continent_Region, Country)
# Range of K values displayed side by side.
from = 4
up_to = 7
# Joint barplot of the selected K values, clusters aligned across K and
# samples grouped/ordered by the metadata labels.
p1 <-   plotQ(alignK(qlist[from:up_to], type = "across"),
            imgoutput="join",
            returnplot=T,exportplot=F,
            quiet=T,basesize=11,
            splab= paste0("K=", K_list[from:up_to]),
            grplab=lab_set, ordergrp=T, grplabangle = 40, grplabheight = 2, grplabsize = 2)
#TODO: add own colors clustercol=c("#A6CEE3", "#3F8EAA", "#79C360", "#E52829", "#FDB762",)
grid.arrange(p1$plot[[1]])

# Chosen number of clusters and the ancestry-coefficient cutoff used below to
# call a genotype confidently assigned to one cluster.
chosen_K = 6
chosen_threshold = 0.8

# Mean admixture coefficient per country and cluster at the chosen K; values
# under 0.02 are dropped to keep the bubble plot readable.
avg_admix <- snmf_results_per_K %>%
  filter(K == chosen_K) %>%
  dplyr::group_by(Continent_Region, for_display, Cluster) %>%
  dplyr::summarize(Ave_admix_coef = round(mean(Admix_coef), 2)) %>%
  dplyr::filter(Ave_admix_coef >= 0.02)

ggplot(avg_admix,
       aes(x = for_display, y = Cluster,
           size = Ave_admix_coef, color = Continent_Region)) +
  geom_point(alpha = 0.5) +
  Color_Continent +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
  labs(x = "", title = "Average admixture coefficient per country")

#Looking at individuals with admixture coef higher than the threshold defined above.
high_anc_coef_snmf = snmf_results_per_K %>%
  filter(K == chosen_K, Admix_coef > chosen_threshold)

##Table
# Counts of high-ancestry genotypes per country (rows) and cluster (columns);
# absent combinations are filled with 0.
kable(
  high_anc_coef_snmf %>%
    dplyr::group_by(for_display, Cluster) %>%
    dplyr::count() %>%
    pivot_wider(names_from = Cluster, values_from = n, values_fill = list(n = 0))
)
for_display V1 V6 V2 V3 V5 V4
Africa_Algeria 1 0 0 0 0 0
Africa_Ethiopia 1 0 0 0 0 0
Africa_Kenya 0 1 0 0 0 0
Africa_Morocco 1 0 0 0 0 0
Africa_Tunisia 3 2 0 0 0 0
Europe_Czech Republic 0 2 0 0 0 0
Europe_Denmark 0 9 0 0 0 0
Europe_France 1 123 0 0 0 0
Europe_Germany 0 5 0 0 0 0
Europe_Ireland 0 3 0 0 0 0
Europe_Italy 1 0 0 0 0 0
Europe_Latvia 0 2 0 0 0 0
Europe_Netherlands 0 5 0 0 0 0
Europe_Sweden 0 6 0 0 0 0
Europe_Switzerland 0 183 0 0 0 0
Europe_UK 0 8 0 0 0 0
Middle East_Iran 11 0 0 0 0 0
Middle East_Israel 32 0 0 0 0 0
Middle East_Syria 4 0 0 0 0 0
Middle East_Turkey 5 0 0 0 0 0
Middle East_Yemen 1 0 0 0 0 0
North America_Canada 0 0 2 0 0 0
North America_United States 0 0 163 0 0 0
Oceania_Australia 0 0 0 56 36 0
Oceania_New Zealand 0 0 0 0 0 29
##Pretty plot
# Number of high-ancestry genotypes per country and cluster, as a bubble plot.
cluster_counts <- high_anc_coef_snmf %>%
  dplyr::group_by(Continent_Region, for_display, Cluster) %>%
  dplyr::count()
p_cluster = ggplot(cluster_counts,
                   aes(x = for_display, y = Cluster,
                       size = n, color = Continent_Region)) +
  geom_point(alpha = 0.5) +
  Color_Continent +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
  labs(x = "", title = "Number of genotypes with admix coef > 0.8")
p_cluster

##Writing out tables for later
# Sample IDs only: used as the vcftools --keep list for the treemix subset.
# NOTE(review): write_tsv(path = ...) is deprecated in readr >= 1.4 in favor
# of file = ... — confirm the installed readr version before changing.
high_anc_coef_snmf %>% dplyr::select(Sample) %>%
  write_tsv(., path = paste0(PopStr_dir, vcf_name, ".pass_noNA.high_anc_coef_snmf.ind"),
            col_names = F)
# Full table (sample, cluster, admixture coefficient, metadata), with header.
high_anc_coef_snmf %>%
  write_tsv(., path = paste0(PopStr_dir, vcf_name, ".pass_noNA.high_anc_coef_snmf.tsv"),
            col_names = T)

Note: I am not entirely sure about whether the high density of isolates from specific fields could bias the results. I remember it was the case with STRUCTURE, not sure if it is also the case of sNMF. To be checked.

Population trees

In the steps before, I have learned about population history indirectly by inferring genetic populations from the genomic data. The relationship between these populations and the underlying demography is not explicit in those analyses, however. It is possible to infer splits between populations and create a population tree. Here, I use treemix, which takes into account the possibility of gene flow between populations and indeed tests for it in the process of creating a population tree.

Because the populations in the clustering were not perfectly distinct from one another, I start with “discretized” populations by choosing only the isolates with high ancestry in one of the sNMF clusters.

# Genotype table restricted to the high-ancestry isolates: no missing data,
# biallelic sites, MAF >= 0.05 (input for the treemix allele counts below).
vcftools --vcf ${VCFDIR}$VCFNAME.recode.vcf \
  --keep ${POPSTR}$VCFNAME.pass_noNA.high_anc_coef_snmf.ind \
  --remove-filtered-all --extract-FORMAT-info GT \
  --max-missing 1.0 --min-alleles 2 --max-alleles 2 \
  --maf 0.05 \
  --out ${POPSTR}$VCFNAME.pass_noNA.high_anc_coef_snmf

# Drop the CHROM/POS columns; keep only the per-isolate genotypes.
cat  ${POPSTR}$VCFNAME.pass_noNA.high_anc_coef_snmf.GT.FORMAT | cut -f 3- \
   >  ${POPSTR}$VCFNAME.pass_noNA.high_anc_coef_snmf.GT.FORMAT2
from collections import defaultdict

#For each isolate, store its pop (the sNMF cluster it was assigned to) in a dictionary
dict_pop = dict(zip(r.high_anc_coef_snmf["Sample"],
    r.high_anc_coef_snmf["Cluster"]))

#Keep a list of the pop names to write in the same order later
all_pops = sorted(set(r.high_anc_coef_snmf["Cluster"]))
out_name = r.PopStr_dir + r.vcf_name + ".pass_noNA.high_anc_coef_snmf.treemix"

# Build the treemix input: a header of population names, then one line per SNP
# holding comma-separated counts of the 0 and 1 alleles for each population.
# Context managers guarantee both files are closed even if a line fails.
with open(out_name, "w") as out, \
     open(r.PopStr_dir + r.vcf_name + ".pass_noNA.high_anc_coef_snmf.GT.FORMAT2") as input_snps:
  out.write(" ".join(all_pops) + "\n")
  indv = []
  for i, snp in enumerate(input_snps):
    #Per-SNP allele counters keyed by population, starting at 0
    dict_snp0 = defaultdict(int)
    dict_snp1 = defaultdict(int)
    Lets_write = True

    #The first line is the name of the isolates
    if i == 0:
      indv = snp.strip().split("\t")
      Lets_write = False
    else:
      #Keeping isolate name and allelic value together
      #...and counting the 0 and 1 based on the pop
      for ind, allele in zip(indv, snp.strip().split("\t")):
        if allele == "0":
          dict_snp0[dict_pop[ind]] += 1
        elif allele == "1":
          dict_snp1[dict_pop[ind]] += 1
        else:
          # Haploid biallelic data expected; flag and skip this SNP line.
          print("Only biallelic please!!!!")
          Lets_write = False
    #If I have not found anything weird, I will write the result to the output file.
    if Lets_write:
      out.write(" ".join(
          ",".join([str(dict_snp0[pop]), str(dict_snp1[pop])])
          for pop in all_pops) + "\n")
# treemix reads gzipped input.
gzip ${POPSTR}$VCFNAME.pass_noNA.high_anc_coef_snmf.treemix

# Maximum-likelihood population tree allowing up to 10 migration edges,
# rooted on cluster V1.
treemix \
  -i ${POPSTR}$VCFNAME.pass_noNA.high_anc_coef_snmf.treemix.gz \
  -o ${POPSTR}$VCFNAME.pass_noNA.high_anc_coef_snmf.treemix.out \
  -m 10 -root V1
source("/Users/feurtey/Documents/Software/treemix-1.13/src/plotting_funcs.R")
t = plot_tree(paste0(PopStr_dir, vcf_name, ".pass_noNA.high_anc_coef_snmf.treemix.out"))
##     V1   V2       V3      V4      V5 V6 V7 V8 V9 V10
## 1    0 <NA> NOT_ROOT NOT_MIG NOT_TIP 32  1  1 51   1
## 2    1   V5 NOT_ROOT NOT_MIG     TIP  0 NA NA NA  NA
## 3    2 <NA> NOT_ROOT     MIG NOT_TIP  0  1 NA NA  NA
## 4    3   V3 NOT_ROOT NOT_MIG     TIP 58 NA NA NA  NA
## 5    4   V2 NOT_ROOT NOT_MIG     TIP 32 NA NA NA  NA
## 6   15   V1 NOT_ROOT NOT_MIG     TIP 53 NA NA NA  NA
## 7   16 <NA> NOT_ROOT NOT_MIG NOT_TIP 53 58  2 32   3
## 8   31   V6 NOT_ROOT NOT_MIG     TIP 58 NA NA NA  NA
## 9   32 <NA> NOT_ROOT NOT_MIG NOT_TIP 16  0  2  4   1
## 10  51   V4 NOT_ROOT NOT_MIG     TIP  0 NA NA NA  NA
## 11  53 <NA>     ROOT NOT_MIG NOT_TIP 53 15  1 16   5
## 12  58 <NA> NOT_ROOT NOT_MIG NOT_TIP 16 31  1  3   1
## 13  77 <NA> NOT_ROOT     MIG NOT_TIP  0 51 NA NA  NA
## 14 147 <NA> NOT_ROOT     MIG NOT_TIP 53 15 NA NA  NA
## 15 213 <NA> NOT_ROOT     MIG NOT_TIP  0  1 NA NA  NA
## 16 128 <NA> NOT_ROOT     MIG NOT_TIP 32  4 NA NA  NA
## 17 272 <NA> NOT_ROOT     MIG NOT_TIP 58  3 NA NA  NA
##                                                                                                                                   V11
## 1                                                                                                (V5:0.107996,V4:0.0728333):0.0169245
## 2                                                                                                                         V5:0.107996
## 3                                                                                                                                <NA>
## 4                                                                                                                         V3:0.228824
## 5                                                                                                                        V2:0.0388698
## 6                                                                                                                        V1:0.0223374
## 7                  ((V6:0.000589609,V3:0.228824):0.00303852,((V5:0.107996,V4:0.0728333):0.0169245,V2:0.0388698):0.00253829):0.0217937
## 8                                                                                                                      V6:0.000589609
## 9                                                                      ((V5:0.107996,V4:0.0728333):0.0169245,V2:0.0388698):0.00253829
## 10                                                                                                                       V4:0.0728333
## 11 (V1:0.0223374,((V6:0.000589609,V3:0.228824):0.00303852,((V5:0.107996,V4:0.0728333):0.0169245,V2:0.0388698):0.00253829):0.0217937);
## 12                                                                                            (V6:0.000589609,V3:0.228824):0.00303852
## 13                                                                                                                               <NA>
## 14                                                                                                                               <NA>
## 15                                                                                                                               <NA>
## 16                                                                                                                               <NA>
## 17                                                                                                                               <NA>
##             x          y      ymin      ymax
## 1  0.04125649 0.33333333 0.1666667 0.5000000
## 2  0.11408983 0.41666667 0.3333333 0.5000000
## 3  0.10080531         NA 0.1666667 0.3333333
## 4  0.11594056 0.58333333 0.5000000 0.6666667
## 5  0.06313656 0.08333333 0.0000000 0.1666667
## 6  0.02233551 0.91666667 0.8333333 1.0000000
## 7  0.02179370 0.50000000 0.0000000 0.8333333
## 8  0.02485376 0.75000000 0.6666667 0.8333333
## 9  0.02433199 0.16666667 0.0000000 0.5000000
## 10 0.11408983 0.25000000 0.1666667 0.3333333
## 11 0.00000000 0.83333333 0.0000000 1.0000000
## 12 0.02457554 0.66666667 0.5000000 0.8333333
## 13 0.11374769         NA 0.1666667 0.3333333
## 14 0.02179191         NA 0.0000000 0.8333333
## 15 0.11408983         NA 0.1666667 0.3333333
## 16 0.02454141         NA 0.0000000 0.1666667
## 17 0.11412284         NA 0.5000000 0.6666667
## [1] "0.595458 0.04125649 0.114089833"
## [1] "0.995302 0.04125649 0.114089833"
## [1] "0.975662 0 0.0223355133752591"
## [1] "0.845105 0.04125649 0.114089833"
## [1] "0.00539676 0.02433199 0.0631365648855101"
## [1] "0.980105 0.0245755369176556 0.115940560291508"

## [1] 0.11408983 0.11594056 0.06313656 0.02233551 0.02485376 0.11408983
## [1] 0.003
## [1] "mse 0.000523137138888889"
# Re-display the per-country cluster counts for comparison with the tree above.
p_cluster

Along with the treemix software are distributed the programs threepop and fourpop. These measure the f3 and f4 statistics, which test for treeness in population trees.

The three-population test is of the form f3(A;B;C), where a significantly negative value of the f3 statistic implies that population A is admixed. The output is four columns: populations | f3 statistic | standard error | Z-score

# f3 tests over all population triplets; jackknife blocks of 500 SNPs.
threepop -i ${POPSTR}$VCFNAME.pass_noNA.high_anc_coef_snmf.treemix.gz -k 500
## npop:6 nsnp:11853
## Estimating covariance matrix in 23 blocks of size 500
## Estimating f_3 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V1;V2,V3 0.0451248 0.00204994 22.0127
## V2;V1,V3 0.0403089 0.00204847 19.6776
## V3;V1,V2 0.103211 0.00502406 20.5434
## Estimating f_3 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V1;V2,V4 0.0466498 0.00228677 20.3999
## V2;V1,V4 0.0387839 0.00193089 20.086
## V4;V1,V2 0.0897219 0.00566013 15.8515
## Estimating f_3 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V1;V2,V5 0.0449815 0.00207906 21.6355
## V2;V1,V5 0.0404522 0.00192851 20.9759
## V5;V1,V2 0.11394 0.0034061 33.4519
## Estimating f_3 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V1;V2,V6 0.045036 0.00181266 24.8452
## V2;V1,V6 0.0403977 0.00165836 24.3601
## V6;V1,V2 0.00105419 0.00086768 1.21495
## Estimating f_3 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V1;V3,V4 0.0537752 0.00300243 17.9106
## V3;V1,V4 0.0945609 0.00492194 19.2121
## V4;V1,V3 0.0825965 0.00618335 13.3579
## Estimating f_3 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V1;V3,V5 0.0726603 0.00290698 24.9951
## V3;V1,V5 0.0756758 0.00739709 10.2305
## V5;V1,V3 0.0862617 0.00232528 37.0973
## Estimating f_3 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V1;V3,V6 0.045901 0.0021903 20.9565
## V3;V1,V6 0.102435 0.00498495 20.5489
## V6;V1,V3 0.000189219 0.000821908 0.230219
## Estimating f_3 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V1;V4,V5 0.0611872 0.00316706 19.3199
## V4;V1,V5 0.0751845 0.00758605 9.91088
## V5;V1,V4 0.0977347 0.00375752 26.0104
## Estimating f_3 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V1;V4,V6 0.0476207 0.00203434 23.4085
## V4;V1,V6 0.0887511 0.00532023 16.6818
## V6;V1,V4 -0.00153048 0.000778229 -1.96662
## Estimating f_3 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V1;V5,V6 0.0437645 0.00196063 22.3216
## V5;V1,V6 0.115157 0.00327818 35.1285
## V6;V1,V5 0.00232569 0.00106883 2.17591
## Estimating f_3 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V2;V3,V4 0.0474343 0.00223 21.271
## V3;V2,V4 0.096086 0.00552364 17.3954
## V4;V2,V3 0.0810715 0.00612219 13.2422
## Estimating f_3 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V2;V3,V5 0.0679877 0.00379085 17.9347
## V3;V2,V5 0.0755325 0.00733645 10.2955
## V5;V2,V3 0.086405 0.00227068 38.0525
## Estimating f_3 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V2;V3,V6 0.0411739 0.0018313 22.4834
## V3;V2,V6 0.102346 0.0049528 20.6644
## V6;V2,V3 0.000278042 0.000613192 0.453434
## Estimating f_3 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V2;V4,V5 0.0549896 0.00290761 18.9123
## V4;V2,V5 0.0735161 0.0077444 9.49281
## V5;V2,V4 0.0994031 0.00362274 27.4386
## Estimating f_3 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V2;V4,V6 0.0413686 0.00177868 23.258
## V4;V2,V6 0.0871372 0.00546256 15.9517
## V6;V2,V4 8.33753e-05 0.000764 0.10913
## Estimating f_3 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V2;V5,V6 0.0391807 0.00173726 22.5531
## V5;V2,V6 0.115212 0.00346718 33.2293
## V6;V2,V5 0.00227121 0.000924398 2.45696
## Estimating f_3 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V3;V4,V5 0.0830879 0.00722354 11.5024
## V4;V3,V5 0.0940696 0.00791809 11.8803
## V5;V3,V4 0.0788496 0.00304438 25.9001
## Estimating f_3 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V3;V4,V6 0.0962806 0.00493025 19.5286
## V4;V3,V6 0.0808768 0.00599492 13.4909
## V6;V3,V4 0.00634375 0.00132474 4.78866
## Estimating f_3 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V3;V5,V6 0.0735394 0.0071297 10.3145
## V5;V3,V6 0.0883982 0.00224008 39.4621
## V6;V3,V5 0.029085 0.00281421 10.335
## Estimating f_3 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V4;V5,V6 0.0713283 0.00717146 9.94614
## V5;V4,V6 0.101591 0.00373716 27.184
## V6;V4,V5 0.0158923 0.00226846 7.00577

The four-population test is of the form f4(A;B;C;D), where a significantly negative value of the f4 statistic implies gene flow in the tree. The output is four columns: populations | f4 statistic | standard error | Z-score

# f4 tests over all population quartets; jackknife blocks of 500 SNPs.
fourpop -i ${POPSTR}$VCFNAME.pass_noNA.high_anc_coef_snmf.treemix.gz -k 500
## npop:6 nsnp:11853
## Estimating covariance matrix in 23 blocks of size 500
## Estimating f_4 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V1,V2;V3,V4 0.00152503 0.0012688 1.20195
## V1,V3;V2,V4 0.00865038 0.00161996 5.33986
## V1,V4;V2,V3 0.00712535 0.00201599 3.53442
## Estimating f_4 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V1,V2;V3,V5 -0.000143303 0.0011177 -0.128212
## V1,V3;V2,V5 0.0275355 0.00285563 9.64252
## V1,V5;V2,V3 0.0276788 0.00270388 10.2367
## Estimating f_4 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V1,V2;V3,V6 -8.88239e-05 0.000884867 -0.100381
## V1,V3;V2,V6 0.000776149 0.000851147 0.911885
## V1,V6;V2,V3 0.000864973 0.00107002 0.808374
## Estimating f_4 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V1,V2;V4,V5 -0.00166833 0.00133229 -1.25223
## V1,V4;V2,V5 0.0145374 0.0028608 5.08158
## V1,V5;V2,V4 0.0162057 0.00262948 6.1631
## Estimating f_4 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V1,V2;V4,V6 -0.00161385 0.00119698 -1.34827
## V1,V4;V2,V6 0.000970816 0.00128514 0.755418
## V1,V6;V2,V4 0.00258467 0.000893807 2.89176
## Estimating f_4 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V1,V2;V5,V6 5.44793e-05 0.00112154 0.0485756
## V1,V5;V2,V6 -0.00121702 0.00080188 -1.51771
## V1,V6;V2,V5 -0.0012715 0.000993483 -1.27984
## Estimating f_4 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V1,V3;V4,V5 0.0188851 0.0029896 6.31693
## V1,V4;V3,V5 0.00741206 0.00245444 3.01985
## V1,V5;V3,V4 -0.011473 0.0030933 -3.709
## Estimating f_4 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V1,V3;V4,V6 -0.00787423 0.00160172 -4.91611
## V1,V4;V3,V6 -0.00615453 0.0018872 -3.2612
## V1,V6;V3,V4 0.0017197 0.000806509 2.13227
## Estimating f_4 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V1,V3;V5,V6 -0.0267593 0.00299377 -8.93834
## V1,V5;V3,V6 -0.0288958 0.00255045 -11.3297
## V1,V6;V3,V5 -0.00213647 0.000902038 -2.36849
## Estimating f_4 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V1,V4;V5,V6 -0.0135666 0.00285146 -4.75776
## V1,V5;V4,V6 -0.0174228 0.00239605 -7.27146
## V1,V6;V4,V5 -0.00385617 0.000980944 -3.93108
## Estimating f_4 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V2,V3;V4,V5 0.0205534 0.00285412 7.20133
## V2,V4;V3,V5 0.00755536 0.00254648 2.96698
## V2,V5;V3,V4 -0.0129981 0.00316866 -4.10207
## Estimating f_4 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V2,V3;V4,V6 -0.00626037 0.00142928 -4.3801
## V2,V4;V3,V6 -0.00606571 0.00143319 -4.2323
## V2,V6;V3,V4 0.000194667 0.000975707 0.199514
## Estimating f_4 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V2,V3;V5,V6 -0.0268138 0.00306488 -8.74872
## V2,V5;V3,V6 -0.028807 0.00275725 -10.4477
## V2,V6;V3,V5 -0.00199317 0.000858599 -2.32142
## Estimating f_4 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V2,V4;V5,V6 -0.0136211 0.00270404 -5.03731
## V2,V5;V4,V6 -0.0158089 0.00236794 -6.67621
## V2,V6;V4,V5 -0.00218783 0.00112569 -1.94354
## Estimating f_4 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V3,V4;V5,V6 0.0131927 0.00326116 4.04541
## V3,V5;V4,V6 -0.00954853 0.00206454 -4.62502
## V3,V6;V4,V5 -0.0227413 0.00291064 -7.81316



Basic statistics: diversity and differentiation


Summary statistics

As of July 2020, this is done on the thinned vcf file. The part for the SFS is fine this way, but the statistics should be run on the whole filtered vcf file instead. At least the code is ready.

# Working directories for the per-chromosome PopGenome inputs.
mkdir ${SUMST}PopGenome_splitchr
mkdir ${SUMST}PopGenome_splitchr/vcf
mkdir ${SUMST}PopGenome_splitchr/gff
mkdir ${SUMST}PopGenome_splitchr/fasta

cd ${SUMST}PopGenome_splitchr

# split into individual chromosome files starting from a complete vcf that contains exactly the loci you want
java -jar /Users/feurtey/Documents/Software/snpEff/SnpSift.jar \
    split ${VCFDIR}${VCFNAME_NOMAF}.recode.vcf

# create directories named after each vcf file, moves each file into its own folder
for x in {1..21} ;
do
  mkdir vcf/$x
  mv ${VCFDIR}${VCFNAME_NOMAF}.recode.${x}.vcf vcf/${x}/${x}.vcf

  mkdir gff/$x
  # keep only the annotation lines for this chromosome
  grep "^${x}\t" ${GFFFILE} > ${SUMST}PopGenome_splitchr/gff/${x}/${x}.gff

# extract this chromosome's sequence from the reference assembly
~/Documents/Software/samtools-1.10/samtools faidx \
   ${REFFILE} ${x} > fasta/${x}.fa

done
# collect the set of chromosomes to process by scanning the vcf folders
# Per-chromosome input locations for PopGenome (created by the shell chunk above).
popgenome_base <- paste0(Sumstats_dir, "PopGenome_splitchr/")
Pop_vcf_dir = paste0(popgenome_base, "vcf/")
Pop_gff_dir = paste0(popgenome_base, "gff/")
Pop_ref_dir = paste0(popgenome_base, "fasta/")

### Define population structure (can even be overlapping among groups if this is necessary)
# define individuals per pop (here it includes also a group that comprises all)

### Define population structure (can even be overlapping among groups if this is necessary)
# define individuals per pop (here it includes also a group that comprises all)

# Sample is a factor at this point (it was reordered for the barplots above),
# so every population vector is coerced to character here. The original code
# applied as.character() to V1 only, leaving V2-V6 as factors — inconsistent
# types for what set.populations() receives below.
cluster_samples <- function(cluster_label) {
  # IDs of the isolates assigned with high ancestry to the given cluster.
  as.character(high_anc_coef_snmf %>%
    filter(Cluster == cluster_label) %>%
    pull(Sample))
}
V1 <- cluster_samples("V1")
V2 <- cluster_samples("V2")
V3 <- cluster_samples("V3")
V4 <- cluster_samples("V4")
V5 <- cluster_samples("V5")
V6 <- cluster_samples("V6")

# Chromosomes to process = the per-chromosome sub-directories created above.
# list.files() replaces the previous system("ls ...") shell-out: no external
# process, and it returns a sorted character vector directly (the old call
# also used intern=T instead of the recommended TRUE).
chr_vec <- list.files(Pop_vcf_dir)
PopGenome_results_list = list()
syn_PopGenome_results_list = list()
# use this line below for testing (without running a loop)
#chr <- chr_vec[1]

### Looping through each chromosome, generate and record all statistics

for (chr in chr_vec) {
  # tryCatch aborts the current iteration if the "if" statement throws an error
  # (avoids processing an empty gff file, which occurs when a chromosome lacks
  # any annotated genes/features)
  tryCatch({

      # size of the per-chromosome gff; 0 means no annotated genes
      info <- file.info(paste0(Sumstats_dir, "PopGenome_splitchr/gff/", chr,"/", chr,".gff"))[,"size"]
      if (info==0) stop(paste("No genes on chr", chr,"- skipping!"))

      ### define paths and load data
      vcf.path <- paste0(Pop_vcf_dir, chr)
      gff.path <- paste0(Pop_gff_dir, chr)
      GENOME.class <- readData(vcf.path, format="VCF", include.unknown=TRUE, gffpath=gff.path)
      all <- unlist(get.individuals(GENOME.class))

      # check how many SNPs were loaded
      GENOME.class@n.biallelic.sites

      # Important: the output only refers to pop1, pop2, ... instead of the
      # names listed below -- record the order: (all, V1, V2, V3, V4, V5, V6)
      GENOME.class <- set.populations(GENOME.class, list(all, V1, V2, V3, V4, V5, V6), diploid = FALSE)
      GENOME.class <- set.synnonsyn(GENOME.class, ref.chr=paste0(Pop_ref_dir, chr,".fa"),save.codons=TRUE)

      # estimate pairwise Fst and reshape it into a convenient data frame.
      # (FIX: removed the dead `pairwise.FST <- t(...)` assignment that was
      # immediately overwritten by the data.frame version below.)
      GENOME.class <- F_ST.stats(GENOME.class, mode="nucleotide")
      pairwise.FST <- as.data.frame(GENOME.class@nuc.F_ST.pairwise) %>%
        mutate(pops = rownames(GENOME.class@nuc.F_ST.pairwise)) %>%
        separate(pops, into = c("Pop1", "Pop2"), sep = "/")
      # Per-SNP annotation flags (coding / synonymous / exonic) for this chromosome
      results = data.frame(CHROM = chr,
                           position = GENOME.class@region.data@biallelic.sites,
                           CodingSNPS = GENOME.class@region.data@CodingSNPS[[1]],
                           synonymous = GENOME.class@region.data@synonymous[[1]],
                           ExonSNPS = GENOME.class@region.data@ExonSNPS[[1]])
      colnames(results) = c("#CHROM", "POS", "CodingSNPS", "synonymous", "ExonSNPS")

      PopGenome_results_list[[chr]] = results

      #Getting statistics from the synonymous positions
      GENOME.class.syn <- neutrality.stats(GENOME.class,subsites="syn")
      GENOME.class.syn  <- diversity.stats(GENOME.class.syn,subsites="syn")

      data.Number_SNPs <- data.frame(GENOME.class.syn@n.segregating.sites, statistic="Number_SNPs", chromosome = chr)
      data.TajD <- data.frame(GENOME.class.syn@Tajima.D,
                              statistic="TajimaD", chromosome = chr)
      data.theta <- data.frame(GENOME.class.syn@theta_Watterson,
                               statistic="theta_Watterson", chromosome = chr)
      data.Nucl_div <- data.frame(GENOME.class.syn@nuc.diversity.within,
                                  statistic="Nuc_Div", chromosome = chr)
      data.Nucl_div_per_site <- data.frame(GENOME.class.syn@nuc.diversity.within/GENOME.class.syn@n.sites,
                                           statistic="Nuc_Div_per_site", chromosome = chr)

      data.full <- rbind(data.Number_SNPs, data.theta,
                         data.TajD, data.Nucl_div, data.Nucl_div_per_site)

      syn_PopGenome_results_list[[chr]] = data.full
  # report why a chromosome was skipped instead of swallowing the error silently
  }, error=function(e){ message("Chromosome ", chr, " skipped: ", conditionMessage(e)) })
}

# Stack the per-chromosome SNP annotation tables into one data frame
PopGenome_results_whole <- bind_rows(PopGenome_results_list)

# Export the synonymous SNP positions (CHROM, POS) for the vcftools chunk below
syn_positions <- PopGenome_results_whole %>%
  filter(synonymous == 1) %>%
  dplyr::select(`#CHROM`, POS)
write_tsv(syn_positions, paste0(Sumstats_dir, "Synonymous_SNPs.list.txt"), col_names = FALSE)

# Per-chromosome synonymous-site statistics, reshaped to long format.
# FIX: across() replaces the mutate_at()/funs() combination -- funs() is
# defunct in current dplyr.
syn_PopGenome_results_per_chr = bind_rows(syn_PopGenome_results_list) %>%
  na_if(., "NaN") %>%
  dplyr::mutate(dplyr::across(starts_with("pop"), as.numeric)) %>%
  pivot_longer(-c(statistic, chromosome), names_to = "Population", values_to = "Estimate")

# Heatmap: Tajima's D at synonymous sites, per population x chromosome
p1 = syn_PopGenome_results_per_chr %>%
  filter(statistic == "TajimaD") %>%
  ggplot(aes(x = Population, y = as.numeric(chromosome), fill = Estimate)) +
  geom_tile() +
  scale_fill_viridis_c() +
  labs (x = "Populations", y = "Chromosome",
        title = "Tajima's D in synonymous positions",
        subtitle = str_wrap(paste(""), width = 70),
        fill = "Tajima's D")+
  theme_light()

# Heatmap: nucleotide diversity per site at synonymous sites
p2 = syn_PopGenome_results_per_chr %>%
  filter(statistic == "Nuc_Div_per_site") %>%
  ggplot(aes(x = Population, y = as.numeric(chromosome), fill = Estimate)) +
  geom_tile() +
  scale_fill_viridis_c(option = "magma") +
  labs (x = "Populations", y = "Chromosome",
        title = str_wrap(paste("Nucleotide diversity per site ",
                               "in synonymous positions"), width = 40),
        subtitle = str_wrap(paste(""), width = 70),
        fill = "Nucleotide diversity")+
  theme_light()
PopGenome_results_list = list()

### Looping through each chromosome, compute and record per-gene statistics

for (chr in chr_vec) {
  # tryCatch aborts the current iteration if the "if" statement throws an error
  # (avoids processing an empty gff file, i.e. a chromosome without annotated genes)
  tryCatch({

      # BUG FIX: this path was rooted in project_dir instead of Sumstats_dir
      # (unlike the first loop above). file.info() on the wrong path returns
      # NA, `if (NA)` errors, and every chromosome was silently skipped.
      info <- file.info(paste0(Sumstats_dir, "PopGenome_splitchr/gff/", chr,"/", chr,".gff"))[,"size"]
      if (is.na(info) || info==0) stop(paste("No genes on chr", chr,"- skipping!"))

      ### define paths and load data
      vcf.path <- paste0(Pop_vcf_dir, chr)
      gff.path <- paste0(Pop_gff_dir, chr)
      GENOME.class <- readData(vcf.path, format="VCF", include.unknown=TRUE, gffpath=gff.path)
      all <- unlist(get.individuals(GENOME.class))

      # check how many SNPs were loaded
      GENOME.class@n.biallelic.sites

      # Important: the output only refers to pop1, pop2, ... -- record the
      # order of the population list: (all, V1, V2, V3, V4, V5, V6)
      GENOME.class <- set.populations(GENOME.class, list(all, V1, V2, V3, V4, V5, V6), diploid = FALSE)
      GENOME.class <- set.synnonsyn(GENOME.class, ref.chr=paste0(Pop_ref_dir, chr,".fa"),save.codons=TRUE)
      # check assignment of populations using GENOME.class@populations (optional)

      ### split GENOME.class into genes (further options are: exon, etc.),
      ### the feature must be present in the GFF file!
      GENOME.class.split <- splitting.data(GENOME.class, subsites="gene")

      ### calculate summary stats per gene
      GENOME.class.split <- neutrality.stats(GENOME.class.split)
      GENOME.class.split <- diversity.stats(GENOME.class.split)

      ### Build dataframes with summary stats per gene and population
      data.Number_SNPs <- data.frame(GENOME.class.split@n.segregating.sites, statistic="Number_SNPs", chromosome = chr, position=GENOME.class.split@region.names)
      data.TajD <- data.frame(GENOME.class.split@Tajima.D, statistic="TajimaD", chromosome = chr, position=GENOME.class.split@region.names)
      data.Nucl_div <- data.frame(GENOME.class.split@nuc.diversity.within, statistic="Nuc_Div", chromosome = chr, position=GENOME.class.split@region.names)
      data.Nucl_div_per_site <- data.frame(GENOME.class.split@nuc.diversity.within/GENOME.class.split@n.sites, statistic="Nuc_Div_per_site", chromosome = chr, position=GENOME.class.split@region.names)

      # syn / non-syn counts per gene (0 = non-synonymous change, 1 = synonymous)
      syn.count <- sapply(GENOME.class.split@region.data@synonymous, function(x) {sum(x == 1, na.rm = TRUE)})
      nonsyn.count <- sapply(GENOME.class.split@region.data@synonymous, function(x) {sum(x == 0, na.rm = TRUE)})

      # These counts are whole-sample quantities: only pop.1 is filled. The
      # other population columns are real NA (FIX: the original used the
      # string "NA", which coerced every pop column to character after rbind).
      data.Number_synSNPs <- data.frame(pop.1=syn.count, pop.2=NA_real_, pop.3=NA_real_, pop.4=NA_real_,
                                        pop.5=NA_real_, pop.6=NA_real_, pop.7=NA_real_,
                                        statistic="Number_synSNPs", chromosome = chr,
                                        position=GENOME.class.split@region.names)
      data.Number_nonsynSNPs <- data.frame(pop.1=nonsyn.count, pop.2=NA_real_, pop.3=NA_real_, pop.4=NA_real_,
                                        pop.5=NA_real_, pop.6=NA_real_, pop.7=NA_real_,
                                        statistic="Number_nonsynSNPs", chromosome = chr,
                                        position=GENOME.class.split@region.names)

      data.full <- rbind(data.Number_SNPs, data.Number_synSNPs, data.Number_nonsynSNPs,
                         data.TajD, data.Nucl_div, data.Nucl_div_per_site)

      PopGenome_results_list[[chr]] = data.full
  # report skipped chromosomes instead of swallowing the error silently
  }, error=function(e){ message("Chromosome ", chr, " skipped: ", conditionMessage(e)) })
}

PopGenome_results_per_gene = bind_rows(PopGenome_results_list)
rm(PopGenome_results_list)

# Tajima's D per gene, reshaped to long format.
# FIX: across() replaces the mutate_at()/funs() combination (funs() is defunct).
temp = PopGenome_results_per_gene %>%
  dplyr::filter(statistic == "TajimaD") %>%
  na_if(., "NaN") %>%
  dplyr::mutate(dplyr::across(starts_with("pop"), as.numeric)) %>%
  pivot_longer(-c(statistic, chromosome, position), names_to = "Population", values_to = "Estimate") %>%
  separate(position, into = c("start", "stop"))

# Median per population, used to color the violins below
temp_sum = temp %>% group_by(Population) %>%
  dplyr::summarize(Median_value = median(Estimate, na.rm = TRUE))

temp = full_join(temp,temp_sum)

p3 = ggplot(temp, aes(x = Population, y = as.numeric(Estimate), fill = Median_value)) +
  geom_violin() +
  geom_boxplot(width = 0.1,
                 outlier.shape = NA, color = "white")+
  labs (x = "", y = "Tajima's D",
        title = "Tajima's D per population in genes",
        subtitle = str_wrap(paste(""), width = 70),
        fill = "Median per pop") +
  theme_light() +
  scale_fill_viridis_c()


# Nucleotide diversity per gene, same reshaping as above
temp = PopGenome_results_per_gene %>%
  dplyr::filter(statistic == "Nuc_Div_per_site") %>%
  na_if(., "NaN") %>%
  dplyr::mutate(dplyr::across(starts_with("pop"), as.numeric)) %>%
  pivot_longer(-c(statistic, chromosome, position), names_to = "Population", values_to = "Estimate") %>%
  separate(position, into = c("start", "stop"))

temp_sum = temp %>% group_by(Population) %>%
  dplyr::summarize(Median_value = median(Estimate, na.rm = TRUE))

temp = full_join(temp,temp_sum)
p4 = ggplot(temp, aes(x = Population, y = log(as.numeric(Estimate)), fill = Median_value)) +
  geom_violin() +
  labs (x = "", y = str_wrap(paste("Nucleotide diversity per site",
                                   "in log scale"), width = 30),
        title = str_wrap(paste("Nucleotide diversity per site ",
                               "per population in genes"), width = 50),
        subtitle = str_wrap(paste(""), width = 70),
        fill = "Median per pop")+
  theme_light() +
  scale_fill_viridis_c(option = "magma")+
  geom_boxplot(width = 0.1,
                 outlier.shape = NA, color = "white")
plot_grid(p1, p2, p3, p4, labels = c('A', 'B', 'C', 'D'), label_size = 12, ncol = 2)

Site frequency spectrum


#Filter vcf file for only synonymous SNPs (positions exported by the R chunk above)
vcftools \
  --vcf ${VCFDIR}${VCFNAME_NOMAF}.recode.vcf \
  --positions ${SUMST}Synonymous_SNPs.list.txt \
  --remove-filtered-all \
  --keep $ZTLIST.txt \
  --non-ref-ac-any 1 \
  --max-missing 1.0 --min-alleles 2 --max-alleles 2 \
  --recode \
  --out ${SUMST}${VCFNAME_NOMAF}.pass_noNA.syn

#Create one count file per population: extract the sample list of each snmf
#cluster (V1..V6), then run vcftools --counts restricted to those samples
for x in {1..6} ;
do
cut -f 1,2 ${POPSTR}${VCFNAME}.pass_noNA.high_anc_coef_snmf.tsv \
  | grep -w "V${x}" | cut -f 1 \
  > ${SUMST}${VCFNAME}.pass_noNA.high_anc_coef_snmf.pop${x}.ind

vcftools \
  --vcf ${SUMST}${VCFNAME_NOMAF}.pass_noNA.syn.recode.vcf \
  --keep ${SUMST}${VCFNAME}.pass_noNA.high_anc_coef_snmf.pop${x}.ind \
  --remove-filtered-all \
  --counts \
  --out ${SUMST}${VCFNAME_NOMAF}.pass_noNA.high_anc_coef_snmf.pop${x}

done


#TODO: Filter vcf file for monomorphic SNP in outgroup and use to create dadi output?
#(custom vcftools build; --max-indv 1 keeps a single individual)
/Users/feurtey/Documents/Software/vcftools_jydu/src/cpp/vcftools \
  --vcf ${SUMST}${VCFNAME_NOMAF}.pass_noNA.syn.recode.vcf \
  --recode-INFO-all --recode \
    --out ${SUMST}${VCFNAME_NOMAF}.pass_noNA.syn.for_dadi \
    --max-indv 1


#Flatten the single-individual vcf to a tab file (CHROM, POS, REF, genotype)
/Users/feurtey/Documents/Software/bcftools/bcftools query -f '%CHROM\t%POS\t%REF[\t%TGT]\n' \
  ${SUMST}${VCFNAME_NOMAF}.pass_noNA.syn.for_dadi.recode.vcf > \
  ${SUMST}${VCFNAME_NOMAF}.pass_noNA.syn.for_dadi.tab
SFS_counts_list = list()

# Read the per-population allele count files (vcftools --counts output,
# columns 5/6 = "allele:count") and derive the minor allele count per site.
for (x in c(1:6)) {
count_file = paste0(Sumstats_dir, vcf_name_nomaf,
                    ".pass_noNA.high_anc_coef_snmf.pop",
                    x,".frq.count")

counts = read_tsv(count_file,
                  skip = 1, col_names = FALSE) %>%
         separate(X5, into = c("REF allele", "REF_count")) %>%
         separate(X6, into = c("ALT allele", "ALT_count")) %>%
         # minor allele count = smaller of the two allele counts
         dplyr::mutate(MAC = pmin(as.numeric(REF_count), as.numeric(ALT_count))) %>%
         dplyr::mutate(Pop = x)
SFS_counts_list[[x]] = counts
}
SFS_counts = bind_rows(SFS_counts_list)

# Folded SFS per population (MAC == 0 sites are monomorphic and dropped).
# FIX: the x aesthetic was a malformed nested call
# `aes(as.numeric(x = as.numeric(MAC)), ...)`; map it directly instead.
SFS_counts %>%
  filter(MAC > 0) %>%
  dplyr::group_by(Pop, MAC) %>%
  dplyr::count(name = "count") %>%
  ggplot(aes(x = as.numeric(MAC), y = count)) +
    geom_col() +
    labs(title = "Site frequency spectrum from the different populations") +
    theme_bw() +
    facet_wrap(Pop~., scales = "free")



Repeat-induced point mutations and transposable elements


Previously, based on the study of TE and RIP in 9 Z.tritici genomes, a hypothesis was drawn. Figure from Lorrain et al. 2020: Repeat Induced Point (RIP) mutations in transposons of Zymoseptoria spp. Histograms of Composite RIP index (CRI) frequencies of transposons estimated using a 50bp sliding windows approach as follows: CRI =(TpA/ ApT) – (CpA + TpG/ ApC + GpT) for A) Z. passerinii, Z. ardabiliae, Z. brevis and Z. pseudotritici, and B) Iranian Z. tritici isolates and C) European Z. tritici isolates. Vertical dash lines exhibit the threshold (0) above which CRI values indicate a RIP signature.

The lower RIP in TEs of European samples, as compared to Iranian isolates, could indicate a loss of RIP in Z.tritici when it spread out of its area of origin. Here, I would like to investigate this possibility in the different populations.

RIP and TE content between populations

The data plotted here are based on the following steps:

  • Map the reads on TE consensus (from Lorrain et al. 2020, so it includes only Iranian and European samples) and create two bins: reads mapping on TEs and reads that are not mapping.
  • Measuring the RIP composite index for reads that mapped on TE.
  • Estimating the index median in TE reads per isolate.

#Run detection of TE/RIP
#NOTE(review): the first `dir=` assignment is immediately overwritten by the
#second; it is presumably kept so the chunk can be re-run against either
#batch of bam files by commenting one line out -- confirm.
dir=/legserv/NGS_data/Zymoseptoria/Aligned_reads_Nuc_Mito_genomes/SRA_2019/PE/Bam/ ;

dir=/legserv/NGS_data/Zymoseptoria/Aligned_reads_Nuc_Mito_genomes/MM_NZ_TAS/Bam/
# sample names = bam index files with directory and ".bam.bai" stripped
list_name=$(ls ${dir}*bai | sed "s|${dir}||" | sed 's/.bam.bai//' )

# submit one cluster job per sample; sleep 2 minutes between submissions
for x in $list_name ; do
  echo $x;
  sbatch TE_GC_RIP_bowtie.sh  $x ${dir} ;
  sleep 2m
done



#Once everything is finished from above
#--------------------------------------
#Gather all results
#NOTE(review): the grep reads from /data3/.../WW_TE_RIP/ but the sed strips a
#/data2/.../4_TE_RIP/ prefix -- the prefixes do not match, so the file names
#would not be cleaned here. Confirm which server/path this was actually run on.
grep "Compo" /data3/alice/WW_project/WW_TE_RIP/0_RIP_estimation/3_RIP_estimation/*txt  | \
  sed 's|/data2/alice/WW_project/4_TE_RIP/0_RIP_estimation/3_RIP_estimation/||' | \
  sed 's/.txt:Composite//' | sed 's/\./ /' \
  > /userhome/alice/WW_project/WW_TE_RIP/Composite_index.txt


#Gathering with too many files
#(remove the old aggregate first since the loops below append with >>)
rm /data2/alice/WW_project/4_TE_RIP/0_RIP_estimation/Composite_index.txt
grep "Compo" /data2/alice/WW_project/4_TE_RIP/0_RIP_estimation/3_RIP_estimation/*RIP_est.txt | \
  sed 's|/data2/alice/WW_project/4_TE_RIP/0_RIP_estimation/3_RIP_estimation/||' | \
  sed 's/.txt:Composite//' | sed 's/\./\t/' \
  >> /data2/alice/WW_project/4_TE_RIP/0_RIP_estimation/Composite_index.txt

# same aggregation for the GC-content lines
grep "GC" /data2/alice/WW_project/4_TE_RIP/0_RIP_estimation/3_RIP_estimation/*RIP_est.txt | \
  sed 's|/data2/alice/WW_project/4_TE_RIP/0_RIP_estimation/3_RIP_estimation/||' | \
  sed 's/.txt:GC//' | sed 's/\./\t/' \
  >> /data2/alice/WW_project/4_TE_RIP/0_RIP_estimation/GC_percent.txt

# append the per-TE composite index files (one per consensus sequence in the
# TE reference fasta) to the same aggregate
REF_NAME=/data2/alice/WW_project/0_Data/Badet_BMC_Biology_2020_TE_consensus_sequences
ls_TE=$(grep ">" ${REF_NAME}.fasta | sed 's/>//')
for TE in $ls_TE;
do
  grep "Compo" /data2/alice/WW_project/4_TE_RIP/0_RIP_estimation/3_RIP_estimation/*${TE}.txt  | \
  sed 's|/data2/alice/WW_project/4_TE_RIP/0_RIP_estimation/3_RIP_estimation/||' | \
  sed 's/.txt:Composite//' | sed 's/\./\t/' \
  >> /data2/alice/WW_project/4_TE_RIP/0_RIP_estimation/Composite_index.txt
done



#Get read number per sample: total mapped reads (genome bam) vs reads mapping
#on the TE consensus, written as a space-delimited table read by the R chunk
#that builds TE_qty below.
echo "ID_file Te_aligned_reads Total_reads" > /data2/alice/WW_project/4_TE_RIP/0_RIP_estimation/Nb_reads.txt
for fq in /data2/alice/WW_project/4_TE_RIP/0_RIP_estimation/2_Aln_TE_consensus/*fq.gz ;  
do  
  name=$(echo $fq | sed 's|/data2/alice/WW_project/4_TE_RIP/0_RIP_estimation/2_Aln_TE_consensus/||' | sed 's/.fq.gz//' );
  echo $name
  # "mapped" line of flagstat, excluding the "with itself and mate mapped" line
  total_reads=$(~/Software/samtools-1.10/samtools flagstat /data2/alice/WW_project/1_Variant_calling/0_Mappings/${name}.bam | grep "mapped" | grep "with" -v | cut -f1 -d " ");    
  aln_reads=$(~/Software/samtools-1.10/samtools flagstat /data2/alice/WW_project/4_TE_RIP/0_RIP_estimation/2_Aln_TE_consensus/${name}.bam | grep "mapped" | grep "with" -v | cut -f1 -d " ");
  #aln_reads=$(zcat /data2/alice/WW_project/4_TE_RIP/0_RIP_estimation/2_Aln_TE_consensus/${name}.fq.gz | paste - - - - | wc -l | cut -f 1);  
  echo $name $aln_reads $total_reads >> /data2/alice/WW_project/4_TE_RIP/0_RIP_estimation/Nb_reads.txt;
done

#Get read number per sample per TE (samtools idxstats: one row per reference
#sequence with length, mapped and unmapped read-segment counts), tagged with
#the sample name. Consumed by the reads_per_TE R chunk below.
rm /data2/alice/WW_project/4_TE_RIP/0_RIP_estimation/Nb_reads_per_TE.txt
for bamF in /data2/alice/WW_project/4_TE_RIP/0_RIP_estimation/2_Aln_TE_consensus/*bam ;  
do
  echo $bamF ;  
  name=$(echo $bamF | sed 's|/data2/alice/WW_project/4_TE_RIP/0_RIP_estimation/2_Aln_TE_consensus/||' | sed 's/.bam//' ) ;
  # temp.txt is a scratch file in the working directory (left behind after the loop)
  /home/alice/Software/samtools-1.10/samtools idxstats $bamF > temp.txt ;  
  awk -v var="$name"  '{OFS="\t"} {print var, $1, $2, $3, $4}' temp.txt >> \
    /data2/alice/WW_project/4_TE_RIP/0_RIP_estimation/Nb_reads_per_TE.txt ;
done

First, I mapped the reads on the TE consensus created by Ursula based on Thomas’s pangenome. I visualize the results here in terms of percentage of reads mapping on these consensus.

#Reading in the data
# Nb_reads.txt (space-delimited) comes from the flagstat chunk above:
# ID_file, Te_aligned_reads, Total_reads.
TE_qty = read_delim(paste0(RIP_DIR, "Nb_reads.txt"), delim = " ") %>%
  # drop samples whose TE-aligned count is not below the total (inconsistent rows)
  dplyr::filter(Total_reads > Te_aligned_reads) %>%
  dplyr::mutate(Percent_TE_Reads = Te_aligned_reads * 100 / Total_reads) %>%
  left_join(readxl::read_excel(paste0(metadata_dir, "Zt_global_data_set_09April2020_DC_AFperso.xlsx"),
                           sheet = 1, n_max = 1000)) %>%
  unite(Continent_Region, Country, ID_file, col = "for_display", remove = F)

# World-wide mean percentage, drawn as the dashed reference line below
world_avg <-
  TE_qty  %>%
  dplyr::summarize(avg = mean(as.numeric(Percent_TE_Reads), na.rm = T)) %>%
  pull(avg)

#Building the basic violin plot per continent
# FIX: removed the redundant theme_classic() -- it was immediately replaced
# by the complete theme_cowplot() further down, so it had no effect.
TE_prop =  TE_qty %>%
  filter(!is.na(Continent_Region)) %>%
  ggplot(aes(x = Continent_Region, y = Percent_TE_Reads, fill = Continent_Region)) +
  geom_hline(aes(yintercept = world_avg),
             color = "gray30", size = 0.6, linetype = "dashed") +
    geom_violin(alpha = .8) +
  stat_summary(fun = mean, geom = "point", size = 2, color = "grey30") +
  Fill_Continent + Color_Continent +
  theme_cowplot()  +
    theme(axis.text.x = element_text(angle = 40, hjust = 1)) +
  labs (x = "", y = str_wrap("Percentage of reads", width = 30),
        title = "Amount of reads mapping on TE consensus per continent",
        subtitle = str_wrap(paste(""), width = 70))

#TE_prop

#TE_prop


#One-way ANOVA with blocks
##Define linear model: TE-read percentage explained by continent, with the
##sequencing Collection as a blocking factor (see discussion below the plot)
model = lm(Percent_TE_Reads ~ Continent_Region + Collection ,
          data=TE_qty)
summary(model)   ### Will show overall p-value and r-squared
## 
## Call:
## lm(formula = Percent_TE_Reads ~ Continent_Region + Collection, 
##     data = TE_qty)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.3156 -1.2556 -0.1235  1.1336  8.2952 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      17.15354    0.91848  18.676  < 2e-16 ***
## Continent_RegionAsia             -0.35553    1.54017  -0.231 0.817550    
## Continent_RegionEurope            0.22300    0.57541   0.388 0.698543    
## Continent_RegionMiddle East      -1.89939    0.62658  -3.031 0.002579 ** 
## Continent_RegionNorth America     0.94382    0.59419   1.588 0.112917    
## Continent_RegionOceania           2.54310    0.69750   3.646 0.000298 ***
## Continent_RegionSouth America     1.26063    0.76898   1.639 0.101859    
## CollectionC2                     -0.02222    0.86040  -0.026 0.979405    
## CollectionC3                     -0.67798    0.96048  -0.706 0.480638    
## CollectionHartmann               -7.54541    0.77820  -9.696  < 2e-16 ***
## CollectionJGI                     3.83328    0.77146   4.969 9.68e-07 ***
## CollectionMM_NZ_TAS               3.49334    0.87795   3.979 8.10e-05 ***
## CollectionThird_batch_2018_BM_TM -0.77949    0.78368  -0.995 0.320459    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.025 on 437 degrees of freedom
##   (75 observations deleted due to missingness)
## Multiple R-squared:  0.8833, Adjusted R-squared:  0.8801 
## F-statistic: 275.6 on 12 and 437 DF,  p-value: < 2.2e-16
##Conduct analysis of variance (Type II sums of squares, car::Anova)
Anova(model,type = "II")  
## Anova Table (Type II tests)
## 
## Response: Percent_TE_Reads
##                  Sum Sq  Df F value    Pr(>F)    
## Continent_Region  423.4   6   17.21 < 2.2e-16 ***
## Collection       8381.2   6  340.69 < 2.2e-16 ***
## Residuals        1791.7 437                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##NOTE(review): this summary() call repeats the one above verbatim -- likely a
##leftover from rendering; harmless but redundant.
summary(model)
## 
## Call:
## lm(formula = Percent_TE_Reads ~ Continent_Region + Collection, 
##     data = TE_qty)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.3156 -1.2556 -0.1235  1.1336  8.2952 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      17.15354    0.91848  18.676  < 2e-16 ***
## Continent_RegionAsia             -0.35553    1.54017  -0.231 0.817550    
## Continent_RegionEurope            0.22300    0.57541   0.388 0.698543    
## Continent_RegionMiddle East      -1.89939    0.62658  -3.031 0.002579 ** 
## Continent_RegionNorth America     0.94382    0.59419   1.588 0.112917    
## Continent_RegionOceania           2.54310    0.69750   3.646 0.000298 ***
## Continent_RegionSouth America     1.26063    0.76898   1.639 0.101859    
## CollectionC2                     -0.02222    0.86040  -0.026 0.979405    
## CollectionC3                     -0.67798    0.96048  -0.706 0.480638    
## CollectionHartmann               -7.54541    0.77820  -9.696  < 2e-16 ***
## CollectionJGI                     3.83328    0.77146   4.969 9.68e-07 ***
## CollectionMM_NZ_TAS               3.49334    0.87795   3.979 8.10e-05 ***
## CollectionThird_batch_2018_BM_TM -0.77949    0.78368  -0.995 0.320459    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.025 on 437 degrees of freedom
##   (75 observations deleted due to missingness)
## Multiple R-squared:  0.8833, Adjusted R-squared:  0.8801 
## F-statistic: 275.6 on 12 and 437 DF,  p-value: < 2.2e-16
## quick visual check of residual normality
hist(residuals(model), col="darkgray")

#Post-hoc analysis:  mean separation tests
#(estimated marginal means per continent, averaged over Collection levels)
library(multcomp)
library(lsmeans)

marginal = lsmeans(model, ~ Continent_Region)

# all pairwise continent contrasts with Tukey-adjusted p-values
pairs(marginal, adjust="tukey")
##  contrast                      estimate    SE  df t.ratio p.value
##  Africa - Asia                    0.356 1.540 437  0.231  1.0000 
##  Africa - Europe                 -0.223 0.575 437 -0.388  0.9997 
##  Africa - Middle East             1.899 0.627 437  3.031  0.0410 
##  Africa - North America          -0.944 0.594 437 -1.588  0.6899 
##  Africa - Oceania                -2.543 0.697 437 -3.646  0.0055 
##  Africa - South America          -1.261 0.769 437 -1.639  0.6568 
##  Asia - Europe                   -0.579 1.467 437 -0.394  0.9997 
##  Asia - Middle East               1.544 1.497 437  1.031  0.9466 
##  Asia - North America            -1.299 1.485 437 -0.875  0.9761 
##  Asia - Oceania                  -2.899 1.528 437 -1.897  0.4832 
##  Asia - South America            -1.616 1.547 437 -1.044  0.9433 
##  Europe - Middle East             2.122 0.385 437  5.507  <.0001 
##  Europe - North America          -0.721 0.330 437 -2.183  0.3071 
##  Europe - Oceania                -2.320 0.477 437 -4.860  <.0001 
##  Europe - South America          -1.038 0.612 437 -1.695  0.6196 
##  Middle East - North America     -2.843 0.361 437 -7.870  <.0001 
##  Middle East - Oceania           -4.442 0.484 437 -9.178  <.0001 
##  Middle East - South America     -3.160 0.665 437 -4.751  0.0001 
##  North America - Oceania         -1.599 0.440 437 -3.632  0.0058 
##  North America - South America   -0.317 0.635 437 -0.499  0.9989 
##  Oceania - South America          1.282 0.732 437  1.751  0.5818 
## 
## Results are averaged over the levels of: Collection 
## P value adjustment: tukey method for comparing a family of 7 estimates
# compact letter display: continents sharing a letter are not significantly different
CLD = cld(marginal,
          alpha   = 0.05,
          Letters = letters,  ### Use lower-case letters for .group
          adjust  = "tukey")  ### Tukey-adjusted p-values

CLD
##  Continent_Region lsmean    SE  df lower.CL upper.CL .group
##  Middle East        15.0 0.358 437     14.0     16.0  a    
##  Asia               16.6 1.465 437     12.6     20.5  abc  
##  Africa             16.9 0.568 437     15.4     18.4   b   
##  Europe             17.1 0.211 437     16.6     17.7   b   
##  North America      17.9 0.298 437     17.1     18.7   b   
##  South America      18.2 0.606 437     16.5     19.8   bc  
##  Oceania            19.5 0.403 437     18.4     20.5    c  
## 
## Results are averaged over the levels of: Collection 
## Confidence level used: 0.95 
## Conf-level adjustment: sidak method for 7 estimates 
## P value adjustment: tukey method for comparing a family of 7 estimates 
## significance level used: alpha = 0.05
# strip the padding spaces so the letters print cleanly on the plot
CLD$.group=gsub(" ", "", CLD$.group)

### Plot: violin plot from above annotated with the significance letters
TE_prop +
  geom_text(data = CLD, aes(x = Continent_Region, label = .group, y = 34), color   = "black")

The statistics used here are a one-way ANOVA with block. Blocks are used in an analysis of variance or similar models in order to account for suspected variation from factors other than the treatments or main independent variables being investigated. Here I considered the collection as the confounding factor. It definitely has an effect and was thus accounted for in the statistics related to TE content and to RIP level.

I would like to go finer in the TE content analysis and look at the reads aligning on each consensus sequence.

# Per-sample, per-TE-consensus read counts (samtools idxstats output from the
# shell chunk above), annotated with the sample metadata spreadsheet.
reads_per_TE = read_delim(paste0(RIP_DIR, "Nb_reads_per_TE.txt"), delim = "\t",
                    col_names = c("ID_file", "TE", "Length",
                                  "# mapped read-segments",  "# unmapped read-segments")) %>%
  separate(TE, into = c("Superfamily", "TE_id"), sep = "_", remove = F) %>%
  # NOTE(review): in Wicker-style TE codes a leading "D" usually denotes
  # Class II (DNA transposons), so labelling the NON-"D" codes as Class II
  # here looks inverted -- confirm the naming scheme of these consensus IDs.
  dplyr::mutate(Order = ifelse(!grepl('^D',TE), "Class II (DNA transposons)", "Class I (retrotransposons)")) %>%
  left_join(readxl::read_excel(paste0(metadata_dir, "Zt_global_data_set_09April2020_DC_AFperso.xlsx"),
                           sheet = 1, n_max = 1000)) %>%
  unite(Continent_Region, Country, ID_file, col = "for_display", remove = F)

# total TE-mapped reads per sample, used to normalize the per-TE counts
temp = reads_per_TE %>% group_by(ID_file) %>%
       dplyr::summarise(Reads_mapped_per_TE = sum(`# mapped read-segments`))

# fraction of a sample's TE-mapped reads falling on each consensus
reads_per_TE = left_join(reads_per_TE, temp) %>%
  dplyr::mutate(Normalized_nb_reads_mapped = `# mapped read-segments` / Reads_mapped_per_TE)



#reads_per_TE %>%
#  mutate(ID_file = fct_reorder(ID_file, Continent_Region)) %>%
#  ggplot(aes(x = ID_file, y = Normalized_nb_reads_mapped, fill = Superfamily)) +
#    geom_bar(stat = "identity")

Let’s try to make a PCA based on the proportion of reads mapping on each TE. Is there a geographical clustering in the TE content?

# Wide matrix: one row per sample, one column per TE consensus holding the
# normalized read fraction; first columns are sample metadata.
TE_PCA_mat = reads_per_TE %>%
  dplyr::select(ID_file, Continent_Region, for_display, TE, Normalized_nb_reads_mapped) %>%
  spread(key = TE, value = as.numeric(Normalized_nb_reads_mapped))

# NOTE(review): after the select/spread above there appear to be only 3
# metadata columns (ID_file, Continent_Region, for_display), so TE columns
# should start at column 4; c(5:ncol(...)) would then drop the first TE
# column -- confirm whether the offset of 5 is intentional.
temp =as.matrix(sapply(TE_PCA_mat[,c(5:ncol(TE_PCA_mat))], as.numeric))
# PCA with scaling requires non-zero variance: drop constant columns
temp = temp[,apply(temp, 2, var, na.rm=TRUE) != 0]

TE.pca = prcomp(temp, center = TRUE,scale. = TRUE)
#summary(TE.pca)

#library(ggfortify)
#autoplot(TE.pca, data = TE_PCA_mat, colour = 'Continent_Region', shape = T, label.size = 3)

# Interactive PC scatterplots (hover text = continent/country/sample)
p1 = cbind(TE_PCA_mat, as.data.frame(TE.pca$x)) %>%
  ggplot(aes(x = PC1, y = PC2, col = Continent_Region, text = for_display)) +
  geom_point(alpha = 0.6) + theme_bw() + Color_Continent +
  labs(title = "PCA based on normalized reads mapping on each TE consensus")
ggplotly(p1)
p2 = cbind(TE_PCA_mat, as.data.frame(TE.pca$x)) %>%
  ggplot(aes(x = PC2, y = PC3, col = Continent_Region, text = for_display)) +
  geom_point(alpha = 0.6) + theme_bw() + Color_Continent
ggplotly(p2)
p3 = cbind(TE_PCA_mat, as.data.frame(TE.pca$x)) %>%
  ggplot(aes(x = PC3, y = PC4, col = Continent_Region, text = for_display)) +
  geom_point(alpha = 0.6) + theme_bw() + Color_Continent
ggplotly(p3)

Let’s now plot the same results, this time with all 4 axes plotted against each other in a static (non-interactive) figure.

#And now all against all but not interactive
# FIX: as.tibble() is deprecated in tibble -- use as_tibble().
temp = as_tibble(cbind(TE_PCA_mat, as.data.frame(TE.pca$x))) %>%
  dplyr::select(for_display, Continent_Region, PC1, PC2, PC3, PC4)

# Pairs plot of the first four PCs, colored by continent
p = ggpairs(temp, columns = c(3:6), ggplot2::aes(col=Continent_Region, fill = Continent_Region, alpha = 0.6),
            title = "PCA based on normalized reads mapping on each TE consensus",
            upper = list(continuous = "points", combo = "box_no_facet"))

# apply the shared theme and continent palettes to every panel
for(i in 1:p$nrow) {
  for(j in 1:p$ncol){
    p[i,j] <- p[i,j] + theme_bw() + Color_Continent +Fill_Continent
  }
}

p

It does look like there is clustering of samples according to geography. This is interesting as it shows that the types of TEs are different in different populations.

I now look at the repeat-induced point mutations in reads that map on the different TE consensus. We expect to see differences in the different geographical groups so I start by visualizing this.

# RIP composite index per sample/TE (aggregated by the shell chunk above),
# joined with the metadata sheet and with reads_per_TE so low-coverage points
# can be filtered out downstream.
RIP=read_tsv(paste0(RIP_DIR, "Composite_index.txt"),
             col_names = c("ID_file", "TE",  "Composite_median", "Composite_mean" )) %>%
  separate(TE, into = c("Superfamily", "TE_id"), sep = "_", remove = F) %>%
  # NOTE(review): same Class I/II labelling as in reads_per_TE -- possibly
  # inverted relative to Wicker "D" codes; confirm.
  mutate(Order = ifelse(!grepl('^D',TE), "Class II (DNA transposons)", "Class I (retrotransposons)"))%>%
  left_join(readxl::read_excel(paste0(metadata_dir, "Zt_global_data_set_09April2020_DC_AFperso.xlsx"),
                           sheet = 1, n_max = 1000)) %>%
  unite(Continent_Region, Country, ID_file, col = "for_display", remove = F) %>%
  left_join(., reads_per_TE)
#DONE: merge with reads_per_TE to be able to filter the points that have too few reads mapped!

#Per continent

# world-wide mean of the per-sample median composite index
# ("RIP_est" rows = whole-sample estimate, as opposed to per-TE rows)
world_RIP_avg <-
  RIP %>%
  filter(TE == "RIP_est") %>%
  dplyr::summarize(avg = mean(as.numeric(Composite_median), na.rm = T)) %>%
  pull(avg)

# per-continent mean, used for the segments from the world average
temp = RIP %>%
  filter(TE == "RIP_est") %>%
  filter(!is.na(Continent_Region)) %>%
  group_by(Continent_Region) %>%
  dplyr::mutate(region_avg = mean(as.numeric(Composite_median), na.rm = T))

# Lollipop-style plot: jittered samples, continent means (large points),
# segments connecting each continent mean to the world average line.
RIP_plot = ggplot(temp, aes(x = Continent_Region,
                            y = as.numeric(Composite_median),
                            color = Continent_Region))   +
  coord_flip() +
  geom_segment(aes(x = Continent_Region, xend = Continent_Region,
        y = world_RIP_avg, yend = region_avg), size = 0.8) +
  geom_jitter(size = 1.5, alpha = 0.2, width = 0.2) +
  geom_hline(aes(yintercept = world_RIP_avg), color = "gray70", size = 0.6) +
  stat_summary(fun = mean, geom = "point", size = 5) +
  Color_Continent +
  theme_cowplot() +
    theme(legend.position = "none",
          axis.text.x = element_text(angle = 40, hjust = 1)) +
  labs (x = "", y = "RIP composite index",
        title = "RIP levels per continents",
        subtitle = str_wrap(paste("The RIP levels in reads mapping on TE consensus",
                                  "are high in the Middle-East",
                                  "and low in North America in particular."), width = 70))


#Statistical test
#One-way ANOVA with blocks
##Define linear model
# Composite_median modelled on continent, with the sequencing/collection
# batch ("Collection") included as a blocking factor.
model <- lm(
  as.numeric(Composite_median) ~ Continent_Region + Collection,
  data = temp
)
summary(model)   ### Will show overall p-value and r-squared
## 
## Call:
## lm(formula = as.numeric(Composite_median) ~ Continent_Region + 
##     Collection, data = temp)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.50684 -0.04543  0.00169  0.04413  0.38740 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       1.551853   0.045539  34.077  < 2e-16 ***
## Continent_RegionAsia             -0.094539   0.083608  -1.131   0.2588    
## Continent_RegionEurope           -0.159444   0.031231  -5.105 4.92e-07 ***
## Continent_RegionMiddle East       0.268819   0.033998   7.907 2.13e-14 ***
## Continent_RegionNorth America    -0.378820   0.032239 -11.750  < 2e-16 ***
## Continent_RegionOceania          -0.298857   0.037481  -7.974 1.33e-14 ***
## Continent_RegionSouth America    -0.172791   0.041745  -4.139 4.17e-05 ***
## CollectionC2                      0.010863   0.042068   0.258   0.7964    
## CollectionC3                      0.001281   0.048029   0.027   0.9787    
## CollectionHartmann               -0.237595   0.037030  -6.416 3.60e-10 ***
## CollectionJGI                     0.111332   0.036620   3.040   0.0025 ** 
## CollectionMM_NZ_TAS               0.189711   0.042851   4.427 1.20e-05 ***
## CollectionThird_batch_2018_BM_TM  0.077286   0.037390   2.067   0.0393 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1099 on 442 degrees of freedom
##   (107 observations deleted due to missingness)
## Multiple R-squared:  0.826,  Adjusted R-squared:  0.8213 
## F-statistic: 174.9 on 12 and 442 DF,  p-value: < 2.2e-16
##Conduct analysis of variance
# Type II sums of squares (car::Anova): each main effect tested after the other.
Anova(model,type = "II")  
## Anova Table (Type II tests)
## 
## Response: as.numeric(Composite_median)
##                   Sum Sq  Df F value    Pr(>F)    
## Continent_Region 14.0300   6  193.52 < 2.2e-16 ***
## Collection        9.8296   6  135.59 < 2.2e-16 ***
## Residuals         5.3407 442                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# NOTE(review): duplicate of the earlier summary(model) call; harmless, but
# the rendered output repeats itself.
summary(model)
## 
## Call:
## lm(formula = as.numeric(Composite_median) ~ Continent_Region + 
##     Collection, data = temp)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.50684 -0.04543  0.00169  0.04413  0.38740 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       1.551853   0.045539  34.077  < 2e-16 ***
## Continent_RegionAsia             -0.094539   0.083608  -1.131   0.2588    
## Continent_RegionEurope           -0.159444   0.031231  -5.105 4.92e-07 ***
## Continent_RegionMiddle East       0.268819   0.033998   7.907 2.13e-14 ***
## Continent_RegionNorth America    -0.378820   0.032239 -11.750  < 2e-16 ***
## Continent_RegionOceania          -0.298857   0.037481  -7.974 1.33e-14 ***
## Continent_RegionSouth America    -0.172791   0.041745  -4.139 4.17e-05 ***
## CollectionC2                      0.010863   0.042068   0.258   0.7964    
## CollectionC3                      0.001281   0.048029   0.027   0.9787    
## CollectionHartmann               -0.237595   0.037030  -6.416 3.60e-10 ***
## CollectionJGI                     0.111332   0.036620   3.040   0.0025 ** 
## CollectionMM_NZ_TAS               0.189711   0.042851   4.427 1.20e-05 ***
## CollectionThird_batch_2018_BM_TM  0.077286   0.037390   2.067   0.0393 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1099 on 442 degrees of freedom
##   (107 observations deleted due to missingness)
## Multiple R-squared:  0.826,  Adjusted R-squared:  0.8213 
## F-statistic: 174.9 on 12 and 442 DF,  p-value: < 2.2e-16
# Quick visual normality check of the model residuals
hist(residuals(model), col="darkgray")

#Post-hoc analysis:  mean separation tests
# Estimated marginal means per continent, averaged over Collection
marginal = lsmeans(model, ~ Continent_Region)

# All pairwise continent contrasts, Tukey-adjusted
pairs(marginal, adjust="tukey")
##  contrast                      estimate     SE  df t.ratio p.value
##  Africa - Asia                   0.0945 0.0836 442   1.131 0.9184 
##  Africa - Europe                 0.1594 0.0312 442   5.105 <.0001 
##  Africa - Middle East           -0.2688 0.0340 442  -7.907 <.0001 
##  Africa - North America          0.3788 0.0322 442  11.751 <.0001 
##  Africa - Oceania                0.2989 0.0375 442   7.974 <.0001 
##  Africa - South America          0.1728 0.0417 442   4.139 0.0008 
##  Asia - Europe                   0.0649 0.0796 442   0.815 0.9834 
##  Asia - Middle East             -0.3634 0.0813 442  -4.472 0.0002 
##  Asia - North America            0.2843 0.0806 442   3.528 0.0084 
##  Asia - Oceania                  0.2043 0.0828 442   2.468 0.1737 
##  Asia - South America            0.0783 0.0840 442   0.932 0.9673 
##  Europe - Middle East           -0.4283 0.0209 442 -20.474 <.0001 
##  Europe - North America          0.2194 0.0179 442  12.240 <.0001 
##  Europe - Oceania                0.1394 0.0255 442   5.473 <.0001 
##  Europe - South America          0.0133 0.0332 442   0.402 0.9997 
##  Middle East - North America     0.6476 0.0196 442  33.023 <.0001 
##  Middle East - Oceania           0.5677 0.0259 442  21.898 <.0001 
##  Middle East - South America     0.4416 0.0361 442  12.234 <.0001 
##  North America - Oceania        -0.0800 0.0235 442  -3.401 0.0129 
##  North America - South America  -0.2060 0.0345 442  -5.976 <.0001 
##  Oceania - South America        -0.1261 0.0394 442  -3.199 0.0247 
## 
## Results are averaged over the levels of: Collection 
## Note: contrasts are still on the as.numeric scale 
## P value adjustment: tukey method for comparing a family of 7 estimates
# Compact letter display: continents sharing a letter in .group are not
# significantly different at alpha = 0.05 (Tukey-adjusted comparisons).
CLD_RIP = cld(marginal,
          alpha   = 0.05,
          Letters = letters,  ### Use lower-case letters for .group
          adjust  = "tukey")  ### Tukey-adjusted p-values

CLD_RIP
##  Continent_Region lsmean     SE  df lower.CL upper.CL .group
##  North America      1.19 0.0159 442     1.15     1.24  a    
##  Oceania            1.27 0.0213 442     1.22     1.33   b   
##  South America      1.40 0.0327 442     1.31     1.49    c  
##  Europe             1.41 0.0111 442     1.38     1.44    c  
##  Asia               1.48 0.0795 442     1.26     1.69   bcd 
##  Africa             1.57 0.0307 442     1.49     1.66     d 
##  Middle East        1.84 0.0192 442     1.79     1.89      e
## 
## Results are averaged over the levels of: Collection 
## Results are given on the as.numeric (not the response) scale. 
## Confidence level used: 0.95 
## Conf-level adjustment: sidak method for 7 estimates 
## Note: contrasts are still on the as.numeric scale 
## P value adjustment: tukey method for comparing a family of 7 estimates 
## significance level used: alpha = 0.05
# cld() pads .group with spaces for alignment; strip them before plotting
CLD_RIP$.group=gsub(" ", "", CLD_RIP$.group)

# y position for the group letters: 10% above the largest observed value.
# (The max is now computed once instead of twice.)
max_composite = max(as.numeric(temp$Composite_median), na.rm = T)
text_y = 1.1 * max_composite

# Re-draw the per-continent plot with the significance letters on top
RIP_plot +
  geom_text(data = CLD_RIP, aes(x = Continent_Region,
                            y = text_y,
                            label = .group), color = "black")

It is known that some TE groups — in particular the MITEs, which are very short — are less RIPped than other types of TEs. I wanted to check whether we see such a pattern, so I visualize here the RIP per superfamily of TEs, and then in relation to the size of the consensus sequence.

#Per  TE superfamily
# Boxplots of the composite index per TE superfamily, keeping only sample/TE
# pairs with a minimal number of mapped reads.
RIP  %>%
  filter(Normalized_nb_reads_mapped > 0.0001) %>%
  group_by(Superfamily)%>%
  # as.numeric() added for consistency with every other use of Composite_median
  mutate(median_Superfamily=median(as.numeric(Composite_median), na.rm = T) )%>%
  ggplot(aes(x = Superfamily,
             y = as.numeric(Composite_median),
             fill = median_Superfamily)) +
    geom_boxplot(outlier.shape = NA) +  # outliers hidden; ylim crops the display
    theme_bw() +
    ylab("Median of composite index on TE reads") +
    theme(legend.position = "none",
          axis.text.x = element_text(angle = 40, hjust = 1)) +
  ylim(-1, 5) + geom_hline( yintercept = 0, color = "navy")

# Median composite index per superfamily x continent x TE order, for the
# dodged bar plots below.
temp = RIP  %>%
  filter(Normalized_nb_reads_mapped > 0.0001) %>%
  group_by(Superfamily, Continent_Region, Order)%>%
  # as.numeric() added for consistency with the other Composite_median uses
  dplyr::summarize(median_Superfamily=median(as.numeric(Composite_median), na.rm = T) )

# Class I (retrotransposon) superfamilies: per-continent median composite index
temp %>%
  filter(Order == "Class I (retrotransposons)") %>%
  ggplot(aes(x = Superfamily, y = median_Superfamily, fill = Continent_Region)) +
  geom_col(position = position_dodge()) +
  Fill_Continent +
  theme_bw() +
  ylab("Median of composite index on TE reads") +
  theme(axis.text.x = element_text(angle = 40, hjust = 1))

# Class II (DNA transposon) superfamilies: per-continent median composite index
temp %>%
  filter(Order == "Class II (DNA transposons)") %>%
  ggplot(aes(x = Superfamily, y = median_Superfamily, fill = Continent_Region)) +
  geom_col(position = position_dodge()) +
  Fill_Continent +
  theme_bw() +
  ylab("Median of composite index on TE reads") +
  theme(axis.text.x = element_text(angle = 40, hjust = 1))

#As relating to TE size
#NB: in the following plot, the alpha parameter is set to make the TE without any reads (or near so) invisible
#This means that not all consensus are visible. In particular, some, annotated  by Ursula as "verylowcopy" are not.
# samtools faidx index of the TE consensus library: gives the length of each
# consensus sequence (joined to RIP below by the shared "TE" column).
TE_consensus_faidx =read_tsv(paste0(TE_RIP_dir, "Badet_BMC_Biology_2020_TE_consensus_sequences.fasta.fai"),
             col_names = c("TE",  "length",  "offset",  
             "number of bases per line",  "number of bytes per line"))

# Median RIP index per TE consensus vs (log) consensus length. Point
# transparency scales with the median number of mapped reads, which makes
# consensus with (near) zero mapped reads invisible.
p = inner_join(RIP, TE_consensus_faidx) %>%
  dplyr::group_by(Order, TE, length) %>%
  filter (TE != "RIP_est") %>%  # drop the genome-wide pseudo-TE entry
  filter(!is.na(Normalized_nb_reads_mapped)) %>%
  # BUGFIX: `na.rm = T` was previously a third summarize() argument (outside
  # median()), creating a spurious `na.rm` column and leaving the read_mapped
  # median without NA removal.
  dplyr::summarize(median_per_consensus = median(Composite_median, na.rm = T),
                   read_mapped = median(Normalized_nb_reads_mapped, na.rm = T)) %>%
  ggplot(aes(x = log(length),
             y = median_per_consensus,
             color = Order)) +
  # `text` is picked up by ggplotly tooltips
  geom_point(aes( text = TE,
             alpha = log(read_mapped)))  + theme_bw() +
  ylim(c(-2, 4)) + geom_hline(yintercept = 0) +
  labs(x = "TE length (log-scale)",
       y = "Median of RIP composite index",
       title = str_wrap(paste("Median of the RIP composite index for each TE consensus",
                        "against the length of the consensus sequence"), width = 60)) # + geom_smooth(span = 1.5, fill = NA, size =0.7)

ggplotly(p)
# Same plot restricted to SINEs and MITEs (short TEs, expected to be less RIPped)
p = inner_join(RIP, TE_consensus_faidx) %>%
  filter (TE != "RIP_est") %>%
  dplyr::group_by(Order, TE, length) %>%
  # BUGFIX: `na.rm = T` moved inside median(); it previously created a stray
  # `na.rm` column and (with no upstream NA filter here) could turn
  # read_mapped into NA for consensus with any missing read counts.
  dplyr::summarize(median_per_consensus = median(Composite_median, na.rm = T),
                   read_mapped = median(Normalized_nb_reads_mapped, na.rm = T)) %>%
  filter(str_detect(TE, pattern = "SINE") | str_detect(TE, pattern = "MITE") ) %>%
  ggplot(aes(x = log(length),
             y = median_per_consensus,
             color = Order, text = TE,
             alpha = log2(read_mapped))) +
  geom_point()  + theme_bw() +
  ylim(c(-2, 4)) + geom_hline(yintercept = 0)

ggplotly(p)

Finally, I look at the relation between the amount of reads mapping on TE consensus sequences and the level of RIP detected. I also investigate several possible biases.

# Join the per-sample TE quantification (TE_qty, built earlier in the file)
# with the genome-wide RIP estimate only.
TE_RIP = inner_join(TE_qty, RIP %>%
  filter(TE == "RIP_est") )

# Keep samples with an unknown collection year visible in year-based aesthetics
TE_RIP$Date_Year[is.na(TE_RIP$Date_Year)] <- "Unknown"

# Per-continent means of TE read percentage and RIP index, used to overlay
# one large point per continent on the per-sample scatter.
temp = TE_RIP %>%
  group_by(Continent_Region) %>%
  dplyr::summarize(TE_qty = mean(Percent_TE_Reads, na.rm = T),
                  Composite_median = mean(as.numeric(Composite_median), na.rm = T)) %>%
  dplyr::mutate(for_display = Continent_Region)

#TE and RIP together
t = ggplot(TE_RIP, aes(as.numeric(Percent_TE_Reads),
                          as.numeric(Composite_median),
                          color = Continent_Region,
                          text = for_display))+
  theme_cowplot()  +
  geom_point(alpha = 0.6) + Color_Continent +
  labs(color = "Geographical group",
       x = "Percentage of TE reads", y = "RIP composite median",
       title = "Amount of transposable elements vs RIP level") +
  # Continent means as large points
  geom_point(data = temp, aes(as.numeric(TE_qty),
                                as.numeric(Composite_median),
                                color = Continent_Region), size = 5)
ggplotly(t)
t

# Same scatter colored by collection batch, to look for batch effects
bias1 = TE_RIP %>% ggplot(aes(Percent_TE_Reads, as.numeric(Composite_median), color =Collection, text = for_display)) +
  theme_cowplot() +
  geom_point(alpha = 0.8)

# Library-strategy version kept for reference (column not always available)
#bias2 = TE_RIP %>% ggplot(aes(Percent_TE_Reads, as.numeric(Composite_median), color =Library_strategy, text = for_display)) +
#  theme_cowplot() +
#  geom_point(alpha = 0.8)

ggplotly(bias1)
# Side-by-side panel: RIP per continent (left) and TE proportion per
# continent (right), each annotated with its compact-letter-display groups.
# NOTE(review): TE_prop and CLD come from the earlier TE-quantity section of
# this notebook (not visible in this chunk) — confirm they are still in scope.
cowplot::plot_grid(RIP_plot +
                     labs (title = "", subtitle = "") +
                     geom_text(data = CLD_RIP, aes(x = Continent_Region,
                                               y = text_y,
                                               label = .group),
                               color = "black"),
                   TE_prop +
                     labs (title = "", subtitle = "") +
                     geom_text(data = CLD, aes(x = Continent_Region,
                                               label = .group,
                                               y = 34),
                               color   = "black") +
                     theme(legend.position = "none") +
                     coord_flip())

# Re-draw the TE-vs-RIP scatter without the Hartmann collection, which shows
# a strong batch effect (see the GC-bias investigation below).
# NOTE(review): `subset` masks base::subset for the rest of the session.
subset = TE_RIP %>%
  filter(Collection != "Hartmann")

# Per-continent means recomputed on the reduced data set
temp = subset %>%
  group_by(Continent_Region) %>%
  dplyr::summarize(TE_qty = mean(Percent_TE_Reads, na.rm = T),
                  Composite_median = mean(as.numeric(Composite_median), na.rm = T)) %>%
  dplyr::mutate(for_display = Continent_Region)

t = ggplot(subset, aes(as.numeric(Percent_TE_Reads),
             as.numeric(Composite_median),
             color = Continent_Region,
             text = for_display)) +
  theme_cowplot()  +
  geom_point(alpha = 0.6) + Color_Continent +
  labs(color = "Geographical group",
       x = "Percentage of TE reads", y = "RIP composite median",
       title = "Amount of transposable elements vs RIP level") +
  # Continent means overlaid as large points
  geom_point(data = temp, aes(as.numeric(TE_qty),
                                as.numeric(Composite_median),
                                color = Continent_Region), size = 5)
ggplotly(t)

Besides geographical origin and collection, I also wanted to check time for subsets of the genomes. The Oceanian samples have often shown their own pattern, so I looked at them separately. Additionally, I am curious about the North American samples since Ursula saw a temporal pattern.

#Checking the Oceanian samples
# TE content vs RIP, colored by sampling year and shaped by country.
# NOTE(review): the `blues` palette is defined earlier in the notebook.
TE_RIP %>%
  filter(Continent_Region == "Oceania") %>%
  ggplot(aes(Percent_TE_Reads, as.numeric(Composite_median), color = as.character(Date_Year), shape = Country, text = for_display)) +
  geom_point(alpha = 0.8) +
  theme_cowplot() +
      scale_color_manual(values = blues) +
  labs(color = "Group",
       x = "Percentage of TE reads", y = "RIP composite median",
       title = "TE vs RIP level in Oceania",
       subtitle = str_wrap(paste0("Oceanian samples have shown a clear ",
                                  "temporal and geographical substructure ",
                                  "based on SNPs. How about RIP/TE content?"),
                                  width = 70))

# TE content vs RIP level for the US samples only, colored by collection
# batch and shaped by sampling year.
us_samples <- TE_RIP %>% filter(Country == "United States")

t <- ggplot(us_samples,
            aes(x = Percent_TE_Reads,
                y = as.numeric(Composite_median),
                color = Collection,
                shape = as.character(Date_Year))) +
  geom_point(alpha = 0.8) +
  theme_bw() +
  labs(color = "Geographical group",
       x = "Percentage of TE reads", y = "RIP composite median",
       title = "TE vs RIP level in North America",
       subtitle = str_wrap(paste0("Ursula's data shows a geographical and a ",
                                  "temporal pattern in the TE content. ",
                                  "Can this be recovered here?"),
                                  width = 70))
t

The RIP index does seem consistent so far with what Cécile has found, with higher RIP in the Middle-East and African populations and lower in the rest (in particular North America and Oceania).

Careful, there is a strong difference between the data from the Hartmann dataset and the rest. Let’s investigate a potential GC bias in the sequencing.

#The coverage files here are produced by bedtools coverage with the hist option using the aligned bam files and a bed file describing 1kb windows.
# Example loop:
#for sample in STnnJGI_SRR4235066 STnnJGI_SRR4235068 STnnJGI_SRR4235067 STnnJGI_SRR4235068 ; do rsync -avP /legserv/NGS_data/Zymoseptoria/Aligned_reads_Nuc_Mito_genomes/SRA_2019/PE/Bam/${sample}.bam ./ ; ~/Software/bedtools coverage -a Zymoseptoria_tritici.MG2.dna.toplevel.mt+.fa.1kb.bed -b ${sample}.bam -hist > ${sample}.coverage.txt ; done
#And gathered like this:
  #for file in *.coverage.txt ; do sample=$(echo $file | sed 's/.coverage.txt//') ; echo $sample ; awk -v #var=$sample '{print var, $1, $2, $3, $4, $5, $6, $7}' $file ; done > Coverage_in_windows.txt


# bedtools -hist emits one row per (window, depth) pair; collapse those rows
# into a single mean depth per window and sample.
depth = read_tsv(paste0(TE_RIP_dir, "Coverage_in_windows.txt"),
                             col_names = c("Sample", "Chr", "Start", "Stop",
                                           "Depth", "Nb_bases_with_depth",
                                           "Size", "Fraction_covered")) %>%
    mutate(Sum_bases = Depth * Nb_bases_with_depth) %>%
    group_by(Sample, Chr, Start, Stop) %>%
    summarize(Nb_bases = sum(Nb_bases_with_depth), Sum_depth = sum(Sum_bases)) %>%
    mutate(mean_depth = Sum_depth / Nb_bases)
# Cache the summary so the raw histogram file need not be re-parsed.
# NOTE(review): `path =` is deprecated in newer readr (use `file =`) — confirm
# the readr version before updating.
write_tsv(depth, path = paste0(TE_RIP_dir, "Coverage_in_windows_summary.txt"))
suffix = ".depth_per_window.txt"
# All per-window depth files produced by the variant-calling pipeline
files <- list.files(paste0(VAR_dir,"1_Depth_per_window/"), pattern = paste0(suffix, "$"))
# 10 random Oregon ("ORE") samples plus a fixed set of samples of interest.
# (A dead `sample(files, size = 0)` assignment that was immediately
# overwritten has been removed, and the repeated element-by-element appends
# are replaced by a single c().)
smaller_files = sample(files[grepl("ORE", files)], size = 10, replace = FALSE)
smaller_files = c(smaller_files,
                  paste0(c("ST90ORE_a12_3B_6.chr_1.corrected",
                           "Ukraine_1995_ST95UR_F1c_2",
                           "Chile_1995_STCH95_1G3a_06",
                           "Indiana_1993_I24a_1",
                           "Indiana_1993_I25a_1",
                           "Indiana_1993_I33a_1",
                           "Israel_1992_ISYB1b",
                           "Israel_1992_ISZH2a"), suffix))

# Read every selected depth file into one long table, tagging each row with
# the sample name derived from the file name.
# (map_dfr replaces the previous grow-a-data.frame-in-a-loop pattern, which
# re-copies the accumulated rows on every iteration.)
depth_per_locus = purrr::map_dfr(smaller_files, function(i) {
  file_name = paste0(VAR_dir, "1_Depth_per_window/", i)
  # Sample name = file name without its directory and suffix
  sample_name = dplyr::last(str_split(file_name, "/")[[1]]) %>%
    str_replace(., pattern = suffix, replacement = "")
  read_tsv(file_name, col_names = c("Locus_name", "Depth")) %>%
    mutate(ID_file = sample_name)
})


# Median depth per sample over the core chromosomes (1-13) only, so that
# presence/absence variation on accessory chromosomes does not bias the
# per-sample normalization factor.
core_windows <- depth_per_locus %>%
  filter(!grepl("\\[", Locus_name)) %>%
  separate(col = Locus_name, into = c("ignore", "chr", "start", "end")) %>%
  filter(as.numeric(chr) <= 13)

median_depth_cor_per_ind = core_windows %>%
  group_by(ID_file) %>%
  dplyr::summarise(Median_core_depth = median(as.numeric(Depth), na.rm = TRUE))


# Attach the per-sample core-chromosome median and the metadata, normalize
# each window's depth by that median, and drop low-coverage samples (<10x).
depth_per_locus = left_join(depth_per_locus, median_depth_cor_per_ind) %>%
  left_join(., readxl::read_excel(paste0(metadata_dir, "Zt_global_data_set_09April2020_DC_AFperso.xlsx"),
                           sheet = 1, n_max = 1000)) %>%
  mutate(Normalized_depth = Depth / Median_core_depth) %>%
  filter(Median_core_depth > 10)
#depth = read_tsv(paste0(TE_RIP_dir, "Coverage_in_windows_summary.txt"))
#depth per locus coming from the PAV from pangenome

#t = read_tsv(paste0(TE_RIP_dir, "Zymoseptoria_tritici.MG2.dna.toplevel.mt+.nucl_content")) %>%
#  unite("#1_usercol", "2_usercol", "3_usercol", col = "Locus_name", remove = F) %>%
#  ggplot(aes(`5_pct_gc`)) + geom_density()

# Per-window GC content from bedtools nuc; the three coordinate columns are
# united into Locus_name so it joins against depth_per_locus below.
Nucl_content = read_tsv(paste0(data_dir, "all_19_pangenome.windows.nuc_GC.tab")) %>%
  unite("#1_usercol", "2_usercol", "3_usercol", col = "Locus_name", remove = F)

# Z-score a numeric vector, ignoring NAs when estimating the mean and sd
# (NA entries stay NA in the output).
scale_this <- function(x){
  center <- mean(x, na.rm = TRUE)
  spread <- sd(x, na.rm = TRUE)
  (x - center) / spread
}


# Depth + GC content per window, restricted to chromosomes 1-4.
# NOTE(review): scale_this(Depth) standardizes across ALL samples pooled; if
# per-sample scaling was intended, group by ID_file first — confirm.
GC_coverage = inner_join(depth_per_locus, Nucl_content,
           by = "Locus_name") %>%
  filter(!grepl("\\[", Locus_name)) %>%
  separate(col = Locus_name, into = c("ignore", "chr", "start", "end")) %>%
  filter(as.numeric(chr) <= 4) %>% #Check GC bias only on core chromosomes to limit accessory bias in coverage
  mutate(scaled_depth = scale_this(Depth))


# Per-sample GC-bias estimate: slope of log(depth + 1) ~ GC fraction across
# windows of the (global) GC_coverage table. Returns the named slope
# coefficient, as in the original implementation.
getAlpha<-function(i){
  #print(i)  # uncomment to trace progress
  sample_windows <- GC_coverage %>%
    filter(ID_file == i) %>%
    mutate(log_depth = log(Depth + 1))
  fit <- lm(log_depth ~ `5_pct_gc`, data = sample_windows)
  fit$coefficients["`5_pct_gc`"]
}
# Vectorized wrapper so it can be used inside mutate() over many IDs
v_getAlpha <- Vectorize(getAlpha)

# One row per sample: depth summaries plus the fitted GC-bias slope.
# NOTE(review): Median_core_depth / Median_depth are computed with mean(),
# not median() — the column names are misleading; confirm intent before
# renaming (they are used by plots below).
GC_bias = GC_coverage %>%
  dplyr::group_by(ID_file) %>%
  dplyr::summarise(Median_core_depth = mean(Median_core_depth, na.rm = T),
                   Median_depth = mean(Depth, na.rm = T))  %>%
  dplyr::mutate(GC_bias_slope = v_getAlpha(ID_file))
# Chromosome 1 windows binned into 1% GC bins; mean depth per bin and sample.
# Samples without a year are the GC-corrected test samples ("corrected").
temp = GC_coverage %>%
  filter(chr == 1) %>%
  mutate(GC_percent = round(`5_pct_gc` * 100, 0)) %>%
  mutate(Date_Year = ifelse(is.na(Date_Year), "corrected", Date_Year)) %>%
 group_by(ID_file, GC_percent, Date_Year) %>%
 dplyr::summarize(Average_depth = mean(Depth, na.rm = T),
                  Window_count = n())

# Average depth per GC bin, one line per sample (3x depth marked as a floor)
p1 = temp %>%
 ggplot(aes(x = GC_percent, y = Average_depth, color = ID_file, linetype = Date_Year)) +
  geom_line() + theme_bw() + geom_hline(yintercept = 3) +
  theme(legend.position = "top") +
  xlim(c(20, 70)) +
  labs(x = "GC percent of the window",
       y = str_wrap("Average depth per window of a given GC percent", width = 20))

# Window counts per GC bin (identical for all samples, so one sample suffices)
# NOTE(review): unique(temp$ID_file[[1]]) takes the first element then unique;
# unique(temp$ID_file)[1] was probably meant — same result here.
p2 = temp %>%
  dplyr::filter(ID_file == unique(temp$ID_file[[1]])) %>%
  ggplot(aes(GC_percent, Window_count)) + geom_bar(stat = "identity") +
  theme_bw() +
  labs(x = "GC percent of the window",
       y = str_wrap("Number of windows of a given GC percent", width = 20))


# Median GC of reads aligning on TEs (proxy for TE GC content), North
# American samples only, split by collection to show the Hartmann batch shift.
p3 = read_tsv(paste0(RIP_DIR, "GC_percent.txt"), col_names = c("ID_file", "TE", "GC_median", "GC_mean")) %>%
  inner_join(readxl::read_excel(paste0(metadata_dir, "Zt_global_data_set_09April2020_DC_AFperso.xlsx"),
                           sheet = 1, n_max = 1000)) %>%
  filter(Continent_Region == "North America") %>%
  ggplot(aes(GC_median, fill = Collection, color = Collection)) +
  geom_density(alpha = 0.4) +
  theme_bw() +
  labs(x = "Median GC content of the reads aligning on TEs") +
  xlim(c(20, 70)) +  # same x range as p1 so panels line up
  theme(legend.position = "bottom")

# Stacked panel: depth-vs-GC curves, window histogram, TE-read GC densities
plot_grid(p1, p2, p3, nrow = 3, ncol = 1, rel_heights = c(1, 0.5, 0.7))

I’ve created a new representation of the GC bias and the first test for the correction.

For the top plot, I have sorted the windows of chromosome 1 in GC percent bins (1% bins). For each bin and for each sample, I then measure the average depth for all windows of the corresponding GC percent and this is what is plotted here. In the middle, you can see an histogram of the number of windows for each bin and, at the bottom, is the median GC content of reads aligning on TEs which I use as a proxy to estimate the GC content of TEs in general.

As we have seen before, there is a severe GC bias in the older genomes: in the windows with a GC content similar to TE reads (42 - 47), the coverage is often below 3 (black horizontal line). I know that you had tested the effect of depth on your method, Ursula, but this is quite an extreme difference between around 20 to 3… Perhaps this could be tested with a manually GC-biased sample.

I have tried the correction on one sample. You can see it in the top panel as a dashed purple line. It really does correct! Obviously the most extreme values don’t get corrected because 0 reads is 0 reads… But it’s not as bad as I expected honestly!

I also tried in silico sequencing with GC bias, but I could find only one software tool that claimed to include GC bias in simulated reads, and unfortunately it did not go as extreme as we needed.

# Selecting a smaller number of samples to illustrate the GC bias
temp = sample(GC_coverage %>% pull(ID_file) %>% unique(), size = 9, replace =F)
# Hexbin of scaled depth vs GC per sample, with the lm fit used for the slope
GC_coverage %>%
  filter(ID_file %in% temp) %>%
  ggplot(aes(`5_pct_gc`, scaled_depth)) +
    geom_hex(bins = 70) +
    scale_fill_continuous(type = "viridis") +
    theme_bw() + facet_wrap(~ID_file, scale = "free") +
    geom_smooth(method = "lm", se = FALSE,
                color = "#29AF7F", size= 0.7) +
    # NOTE(review): the y axis shows the scaled (z-scored) depth while the
    # label says "Depth per window" — consider relabelling.
    labs(x = "GC percent per window",
         y = "Depth per window",
         title = "Illustration of the GC bias",
         subtitle = "I use the slope of a linear model to infer the GC bias.")

#The following block is silenced: the code is stored as a string literal so
#it never runs. It checked that the lm slope matches the geom_smooth fit.
silence = 'i=temp[1]
temp = GC_coverage %>%
     filter(ID_file == i) %>%
     mutate(log_depth = scaled_depth)
linearMod <- lm(log_depth ~ `5_pct_gc`, data=temp)  # build linear regression model on full data
p = GC_coverage %>%
  filter(ID_file == i) %>%
ggplot(aes(`5_pct_gc`, scaled_depth)) +
  geom_hex(bins = 70) +
  scale_fill_continuous(type = "viridis") +
  theme_bw() + facet_wrap(~ID_file, scale = "free") +
  geom_smooth(method = "lm", se = FALSE, color = "#29AF7F")
p + geom_abline(intercept = linearMod$coefficients["(Intercept)"], slope = linearMod$coefficients["`5_pct_gc`"]) +
  ylim(c(-6, 15))'


# Checking on a possible link between depth and GC bias
# Distribution of per-sample median core depth
p1 = median_depth_cor_per_ind %>%
  ggplot(aes(Median_core_depth)) +
  geom_density(fill = mycolorsCorrel[7], col = mycolorsCorrel[1]) +
  theme_bw() +
  labs(x = "Median depth on core chromosome")  # typo fixed ("chromosmome")

# Distribution of the fitted GC-bias slopes.
# BUGFIX: a redundant bare geom_density() layer that was drawn underneath the
# styled one has been removed.
p2 = ggplot(GC_bias, aes(GC_bias_slope)) +
  geom_density(fill = myColors[1], col = myColors[1], alpha = 0.8) +
  theme_bw()+
  labs(x = "GC bias")

top = cowplot::plot_grid(p1, p2, labels = c("A", "B"))

# Depth vs GC-bias slope per sample, colored by collection batch
p3 = inner_join(GC_bias, TE_RIP, by = "ID_file") %>%
  ggplot(aes( Median_core_depth, GC_bias_slope, text = for_display, color = Collection)) +
  geom_point() + theme_light() +
  labs(x = "Median depth on core chromosome",
       y = "GC bias")

cowplot::plot_grid(top, p3, ncol = 1, nrow = 2, labels = c("", "C"))

# And now let's investigate if there is a correlation between GC bias and the values of interest from before
# One facet per confounded estimate (RIP index and TE read percentage).
# NOTE(review): `temp` now holds a ggplot object, shadowing the earlier data
# frame of the same name.
temp = inner_join(GC_bias, TE_RIP, by = "ID_file") %>%
  pivot_longer(cols = c(Composite_median, Percent_TE_Reads), names_to = "Confounded estimate", values_to = "Estimated value") %>%
  ggplot(aes(`Estimated value`, GC_bias_slope, text = for_display, color = Collection)) +
  geom_point() + theme_light() +
  facet_wrap(vars(`Confounded estimate`), scales = "free") +
  labs(x = "",
       y = "GC bias (slope of the per window estimate)",
       title = "Effect of GC bias on TE and RIP estimation")
ggplotly(temp)
# TE vs RIP, by collection batch and year, for comparison
t = TE_RIP %>%
     ggplot(aes(Percent_TE_Reads, as.numeric(Composite_median), color = Collection,
                shape = as.character(Date_Year), text = for_display)) +
     geom_point(alpha = 0.8) +
     theme_bw()
ggplotly(t)

I tried to correct the GC bias by using deeptools correctGCBias on one sample from the Oregon population. I then reran the PCA on the TE content. Although the individual depth values did change, the sample's position in the PCA did not.

I don’t know if the GC bias is the reason for the clear difference between the Hartmann dataset and the rest, or if the two are consequences of something else. Either way, it does not seem reasonable to compare this dataset to the others. It does seem that, within this dataset, the observed differences match what can be inferred from the rest of the samples.

RIP along the chromosomes

# Per-TE-insertion RIP estimates computed directly on assembled genomes
RIP_in_high_copies_TE = read_tsv(paste0(TE_RIP_dir, "All_genomes.all_TEs.GC.RIP.tab"),
             col_names = c("Sample", "Seq_ID", "GC", "Product_index", "Substrate_index", "Composite_index"))

# Seq_ID encodes "<sample>.<chrom>:<start>-<end>"; split it and compute the
# insertion midpoint. NOTE(review): separate() writes into "Sample", which
# overwrites the Sample column read from the file — confirm both agree.
temp = RIP_in_high_copies_TE %>%
  separate(Seq_ID, into = c("Sample", "Chrom", "Start", "End"),
           sep = "\\.|-|:", remove = F) %>%
  dplyr::mutate(Start = as.integer(Start), End = as.integer(End)) %>%
  dplyr::mutate(Coord = (Start + End)/2)

# Silenced exploratory per-chromosome scatter, kept for reference
#temp %>%
#  filter(Chrom == "chr_4") %>%
#  filter(Composite_index <= 2 & Composite_index >= - 2 )%>%
#  ggplot(aes(Coord, Composite_index, col = Composite_index)) +
#    geom_point() +
#    facet_wrap (vars(Sample), ncol = 1) +
#  theme_bw() +
#  scale_color_viridis_c()

# Heatmap of the mean composite index in 100-kb windows. Window ids are
# zero-padded so lexicographic order on the x axis matches genomic order.
temp %>%
  dplyr::mutate(Nb = str_pad(round((Coord / 100000), 0), 6, pad = "0")) %>%
  filter(Composite_index >= -2 & Composite_index<= 2) %>%
  #filter(Chrom == "chr_1") %>%
  unite(Window, Chrom, Nb) %>%
  group_by(Sample, Window) %>%
  dplyr::summarize(Composite = mean(Composite_index, na.rm = T)) %>%
  ggplot(aes(Window, Sample, fill = Composite)) +
    geom_tile() +
  theme_bw() +
  scale_fill_viridis_c()

# Per-chromosome mean composite index heatmap (sample x chromosome),
# restricted to plausible index values in [-3, 3].
temp %>%
  filter(between(Composite_index, -3, 3)) %>%
  mutate(Chr = as.integer(str_replace(Chrom, "chr_", ""))) %>%
  group_by(Sample, Chr) %>%
  dplyr::summarise(Composite = mean(Composite_index, na.rm = TRUE)) %>%
  ggplot(aes(x = Chr, y = Sample, fill = Composite)) +
  geom_tile() +
  theme_bw() +
  scale_fill_viridis_c()

Is dim2 present in its intact version in some populations?

The next step will be to check whether dim2 is present where the RIP values suggest it is: intact copies in the Middle East and Africa, and absence or degeneration elsewhere. Here, I use de novo genome assemblies and try to identify copies of dim2. For this, I took a deRIPped sequence of dim2 from the reference and blasted it onto Zt10 to obtain the Zt10_dim2 sequence, since Zt10 is known to carry an intact copy (see Mareike’s paper). I then compare this sequence (blastn) with the de novo assemblies to pull out all the copies and identify the highest identity score.

#Here the Zt10_dim2_from_MgDNMT_deRIP.fa is the sequence of Zt10_6.417 which corresponds to the deRIPPed version of dim2 in the reference IPO323 https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2940312/pdf/GEN186167.pdf
# Blast the deRIPped dim2 query against each Hartmann-2017 de novo assembly
# and gather all hits in a single space-separated table.
# NOTE(review): parsing `ls` output breaks on names with whitespace; assumed
# safe for these assembly directory names — confirm.
dir_source=/legserv/NGS_data/Zymoseptoria/Data_files/Illumina_Denovo/Hartmann_2017_Denovo/
list_spades=$(ls $dir_source)
out_file=/data2/alice/WW_project/4_TE_RIP/1_Ztdim2_detection/1_Blast_from_denovo_assemblies/Results_blast_dim2.txt
# Header line for the combined tabular blast results
echo "sample qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore" > $out_file
for x in $list_spades ;
  do
  # Clean sample name derived from the assembly directory name
  sample=$(echo $x | sed 's/\.R/_R/' | sed 's/\.S\./_S\./' | cut -d '.' -f 3) ; echo $sample $x ;

# Fetch the assembly locally; each iteration overwrites scaffolds.fasta
rsync -avP \
  ${dir_source}${x}/scaffolds.fasta \
  /data2/alice/WW_project/4_TE_RIP/1_Ztdim2_detection/1_Blast_from_denovo_assemblies/

# Standardize scaffold names before building the blast database
python2 /userhome/alice/Scripts/Rename_fragments_in_fasta.py  \
    -i /data2/alice/WW_project/4_TE_RIP/1_Ztdim2_detection/1_Blast_from_denovo_assemblies/scaffolds.fasta \
    -o /data2/alice/WW_project/4_TE_RIP/1_Ztdim2_detection/1_Blast_from_denovo_assemblies/${sample}.fasta \
    --simple -f spades

~/Software/ncbi-blast-2.10.0+/bin/makeblastdb \
  -dbtype nucl \
  -in /data2/alice/WW_project/4_TE_RIP/1_Ztdim2_detection/1_Blast_from_denovo_assemblies/${sample}.fasta \
  -out /data2/alice/WW_project/4_TE_RIP/1_Ztdim2_detection/1_Blast_from_denovo_assemblies/${sample}

# Tabular blast output (outfmt 6), prefixed with the sample name, appended
~/Software/ncbi-blast-2.10.0+/bin/blastn \
  -query /userhome/alice/WW_project/WW_TE_RIP/Zt10_dim2_from_MgDNMT_deRIP.fa \
  -db /data2/alice/WW_project/4_TE_RIP/1_Ztdim2_detection/1_Blast_from_denovo_assemblies/${sample} \
  -outfmt 6  | \
  awk -v sample="${sample}" 'BEGIN {OFS = " "} {print sample, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12}' >> \
  $out_file

 done


# Per-sample pipeline: BAM -> fastq -> SPAdes de novo assembly -> blastn of
# the deRIPped dim2 query against the assembly.
# NOTE(review): this `while read sample` loop has no input redirection on its
# `done`; it must be fed a sample list, e.g. run as
# `bash script.sh < sample_list.txt`. Confirm how it is invoked.
while read sample ;
do

out_file=/data2/alice/WW_project/4_TE_RIP/1_Ztdim2_detection/1_Blast_from_denovo_assemblies/Results_blast_dim2_${sample}.txt
#Software directories
BWAPATH=/userhome/alice/Software/bwa-0.7.17/
BEDTOOLS_PATH=/userhome/alice/Software/
SAMTOOLS_PATH=/userhome/alice/Software/samtools-1.10/
BOWTIE_PATH=/userhome/alice/Software/bowtie2-2.4.1-linux-x86_64/
SCRIPTS_PATH=/userhome/alice/Scripts/
REF_NAME=/userhome/alice/WW_project/WW_TE_RIP/Badet_BMC_Biology_2020_TE_consensus_sequences
SPADES_PATH=/userhome/alice/Software/SPAdes-3.14.1-Linux/bin/
BLAST_PATH=/userhome/alice/Software/ncbi-blast-2.10.0+/bin/

# Files directories
DATA_DIR=/data3/alice/WW_project/Data/
WWTERIP_DIR=/data3/alice/WW_project/WW_TE_RIP/
RIP_DIR=${WWTERIP_DIR}0_RIP_estimation/
RIP_raw0=${RIP_DIR}0_BAM_temp/
RIP_raw1=${RIP_DIR}1_Fastq_from_bam/
RIP_aln=${RIP_DIR}2_Aln_TE_consensus/
RIP_est=${RIP_DIR}3_RIP_estimation/
DIM2_DIR=${WWTERIP_DIR}1_Ztdim2_detection/1_Blast_from_denovo_assemblies/
DIM_denovo=${DIM2_DIR}0_Spades/
DIM_blast=${DIM2_DIR}1_Blast_dim2_deRIPped/


#Getting fastq reads
# Name-sort so mate pairs are adjacent, as bamtofastq requires
${SAMTOOLS_PATH}samtools sort \
  -n \
  -o ${DATA_DIR}${sample}.sorted.bam \
  ${DATA_DIR}${sample}.bam
rm ${DATA_DIR}${sample}.bam

${BEDTOOLS_PATH}bedtools bamtofastq \
  -i ${DATA_DIR}${sample}.sorted.bam \
  -fq ${DATA_DIR}${sample}_R1.fq \
  -fq2 ${DATA_DIR}${sample}_R2.fq

#De novo assembly
# -f added so a missing output directory does not abort the first run
rm -rf ${DIM_denovo}${sample}
# BUGFIX: SPAdes previously read ${sample}_1.fq.gz / _2.fq.gz, which are
# never produced; bamtofastq writes ${sample}_R1.fq / _R2.fq (uncompressed).
${SPADES_PATH}spades.py  \
  -o ${DIM_denovo}${sample} \
  --careful \
  -1 ${DATA_DIR}${sample}_R1.fq \
  -2 ${DATA_DIR}${sample}_R2.fq

# Standardize scaffold names before building the blast database.
# BUGFIX: a "/" was missing between the SPAdes output directory and
# scaffolds.fasta.
python2 ${SCRIPTS_PATH}Rename_fragments_in_fasta.py  \
    -i ${DIM_denovo}${sample}/scaffolds.fasta \
    -o ${DIM_blast}${sample}.fasta \
    --simple -f spades

${BLAST_PATH}makeblastdb \
  -dbtype nucl \
  -in ${DIM_blast}${sample}.fasta \
  -out ${DIM_blast}${sample}

# Tabular blast output, prefixed with the sample name, appended to out_file
${BLAST_PATH}blastn \
  -query /userhome/alice/WW_project/WW_TE_RIP/Zt10_dim2_from_MgDNMT_deRIP.fa \
  -db ${DIM_blast}${sample} \
  -outfmt 6  | \
  awk -v sample="${sample}" 'BEGIN {OFS = " "} {print sample, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12}' >> \
  $out_file

done

Let’s look at the results as dot plots and compare the results from the dim2 blast to the RIP composite index. So far the Middle-Eastern samples seem quite similar to one another, whereas other regions contain way more variability such as Europe.

In order to identify the native dim2 copy in genomes, I look for several things:

  • the blast match has to be on the same contig as at least one of the two known flanking genes
  • the blast match has to be between the two flanking genes if both are on the same contig or less than 10 kb from the known flanking gene on the same contig
# Concatenate all per-sample blast tables into a single overall file.
system(paste0("cat ", DIM2_DIR, "Results_blast_dim2*txt > ", DIM2_DIR, "Overall_results_blast_dim2.txt"))

# Reference lengths (bp) of dim2 and its two flanking genes; a blast match is
# retained downstream only when it covers at least 80% of the gene length.
min_cov_frac <- 0.8
length_dim2 <- 3846
length_flank1 <- 3689
length_flank2 <- 1222
threshold_length_dim2 <- min_cov_frac * length_dim2
threshold_length_flank1 <- min_cov_frac * length_flank1
threshold_length_flank2 <- min_cov_frac * length_flank2

# Read the concatenated blast results (space-separated, no usable header).
# NOTE(review): 14 column names are supplied here ("gene" in second position)
# while the loop above writes 13 space-separated fields and a 13-field header
# line — presumably the files actually concatenated came from a multi-gene
# query run whose first blast field encodes the gene name; confirm on disk.
dim2_blast_results_complete = read_delim(paste0(DIM2_DIR, "Overall_results_blast_dim2.txt"), delim = " ",
                                col_names = c("sample", "gene", "qseqid", "sseqid", "pident", "length",
                                              "mismatch", "gapopen", "qstart", "qend",
                                              "sstart", "send", "evalue", "bitscore"))

# Hits of the first flanking gene: keep matches covering >80% of the gene
# length, record each match midpoint (midcoord_flank1) and the contig carrying
# it (column dim2_flank1 after pivot_wider), plus the number of retained hits
# per sample (nb_gene). The result stays grouped by sample.
flank1 = dim2_blast_results_complete %>%
  filter(gene == "dim2_flank1") %>%
  dplyr::select(sample, gene, sseqid, length, sstart, send) %>%
  dplyr::mutate(midcoord_flank1 = (sstart + send)/2) %>%
  filter(length > threshold_length_flank1) %>%
  pivot_wider(names_from = gene, values_from = sseqid) %>%
  dplyr::select(-length, -sstart, -send) %>%
  group_by(sample) %>%
  dplyr::mutate(nb_gene = n())

# Same extraction for the second flanking gene (columns midcoord_flank2 and
# dim2_flank2); mirrors the flank1 pipeline above.
flank2 = dim2_blast_results_complete %>%
  filter(gene == "dim2_flank2") %>%
  dplyr::select(sample, gene, sseqid, length, sstart, send) %>%
  dplyr::mutate(midcoord_flank2 = (sstart + send)/2) %>%
  filter(length > threshold_length_flank2) %>%
  pivot_wider(names_from = gene, values_from = sseqid) %>%
  dplyr::select(-length, -sstart, -send) %>%
  group_by(sample) %>%
  dplyr::mutate(nb_gene = n())

#First let's extract some information such as finding the middle coordinates for the
# 3 genes of interest
# Join the dim2 hits with both flank tables (full join keeps samples where a
# flank was found but no dim2 hit), then compute the dim2 hit midpoint and the
# ordered pair of flank midpoints.
dim2_blast_results = dim2_blast_results_complete %>%
  dplyr::filter(gene == "dim2") %>%
  full_join(., flank1, by = "sample") %>%
  full_join(., flank2, by = "sample") %>%
  # BUG FIX: fill missing flank coordinates with numeric 0 directly instead of
  # the character "0" + as.numeric() round-trip — replace_na() errors on a
  # type mismatch with current tidyr (the column is a double).
  dplyr::mutate(dim2_flank1 = replace_na(dim2_flank1, "Not_found"),
                dim2_flank2 = replace_na(dim2_flank2, "Not_found"),
                midcoord_flank1 = replace_na(midcoord_flank1, 0),
                midcoord_flank2 = replace_na(midcoord_flank2, 0)) %>%
  # Midpoint of the dim2 blast hit on its contig.
  dplyr::mutate(ave_coord = (sstart + send) / 2) %>%
  # pmin()/pmax() are the vectorized equivalents of the previous ifelse()
  # swap and give the lower/upper flank midpoint per row.
  dplyr::mutate(min_fl = pmin(midcoord_flank1, midcoord_flank2),
                max_fl = pmax(midcoord_flank1, midcoord_flank2))

#Now, let's compare the blast results of dim2 with the flanking genes
# (contig name and distance)
# is_same_contig: which flanking gene(s) share a contig with the dim2 hit
#   ("Both"/"Flank1"/"Flank2"/"None"; "What" is an unreachable guard value).
# distance: "Distance_OK" when the dim2 hit midpoint lies between the two
#   flank midpoints (both flanks on the contig), or within 10 kb of the single
#   flank found on the same contig; otherwise "Not_between"/"Too_far".
# Nb_flanking_found_2cat: ">1" when at least one flanking gene is on the same
#   contig at an acceptable distance, "0" otherwise.
# NOTE(review): rows with NA sseqid (samples with flanks but no dim2 hit,
# introduced by the full joins above) propagate NA through these ifelse()
# chains — presumably intended; confirm before replacing with case_when().
dim2_blast_results = dim2_blast_results %>%
  dplyr::mutate(is_same_contig = ifelse(sseqid == dim2_flank1 & sseqid == dim2_flank2,
                                 "Both",
                                 ifelse(sseqid != dim2_flank1 & sseqid != dim2_flank2,
                                 "None",
                                 ifelse(sseqid == dim2_flank1, "Flank1",
                                 ifelse(sseqid == dim2_flank2, "Flank2", "What"))))) %>%
  dplyr::mutate(distance = ifelse(is_same_contig == "None", "No_distance",
                           ifelse(is_same_contig == "Both",
                                  ifelse(ave_coord >= min_fl &
                                         ave_coord <= max_fl ,
                                  "Distance_OK", "Not_between"),
                                  ifelse(is_same_contig == "Flank1",
                                         ifelse((abs(midcoord_flank1 - ave_coord) <= 10000),
                                  "Distance_OK", "Too_far"),
                                  ifelse((abs(midcoord_flank2 - ave_coord) <= 10000),
                                  "Distance_OK", "Too_far"))))) %>%
  mutate(Nb_flanking_found_2cat = ifelse(is_same_contig == "None", "0",
                                         ifelse(distance == "Distance_OK", ">1", "0")))



  #dplyr::select(sample, sseqid, length, dim2_flank1, dim2_flank2, is_same_contig) %>%



# Density of blast-match lengths against Zt10dim2, with the 80%-coverage
# threshold shown as a vertical grey line.
dim2_blast_results %>%
  ggplot(aes(x = length)) +
    geom_density(alpha = 0.7) +
    geom_vline(aes(xintercept = threshold_length_dim2), color = "gray70", size = 0.6) +
    theme_bw() +
    labs(x = "Length (bp)",
         y = "Density",
         title = "Length of all blast matches against Zt10dim2",
         subtitle = str_wrap(paste("There is a large range in the size of the matches.",
                                   " In order to ignore very short matches, I could set up a threshold",
                                   " visualized here as the grey line."),
         width = 70))

However, the length also greatly varies for copies for which we were able to detect one or two of the flanking genes on the same contig.

# Panel A: match-length densities colored by which flank(s) share the contig.
p1 = dim2_blast_results %>%
  ggplot(aes(length, fill = is_same_contig, color = is_same_contig)) +
    geom_density(alpha = 0.7) +
    theme_bw() +
    labs(x = "Length (bp)",
         y = "Density",
         title = str_wrap("Length of all blast matches against Zt10dim2", width = 30))


# Panel B: counts of matches per flanking-gene category.
p3 = dim2_blast_results %>%
  ggplot(aes(Nb_flanking_found_2cat, fill = is_same_contig)) +
    geom_bar(alpha = 0.7) +
    theme_bw() +
  labs ( x = "Number of flanking genes on the same contig",
         y = "Number of copies",
         title = str_wrap("Numbers of copies found in the same contig as the flanking genes ", width = 30)) +
  theme(legend.position = "none")

# Shared legend extracted from p1, laid out as a single bottom row.
legend_b <- get_legend(
  p1 +
    guides(color = guide_legend(nrow = 1)) +
    theme(legend.position = "bottom")
)

# Two panels side by side, legend underneath.
prow <- plot_grid(
  p1 + theme(legend.position="none"),
  p3 + theme(legend.position="none"),
  align = 'vh',
  labels = c("A", "B"),
  hjust = -1,
  nrow = 1)

plot_grid(prow, legend_b, ncol = 1, rel_heights = c(1, .1))

Now, I will select only the copies which have at least one flanking gene found on the same contig. I consider these the “native” copy of the gene. And I will then make some plots using only these original copies to investigate the geographical distribution of both length and identity with the dim2 copy of Zt10.

# Per-sample totals of blast matches (all contigs), used to annotate the
# native-copy summary below.
temp = dim2_blast_results %>%
  group_by(sample) %>%
  dplyr::summarize(n_matches = n(),
                   n_long_matches = sum(length > 1000))

# Per-sample summary of the putative native dim2 copy: total matched length,
# length-weighted identity, and number of matches on native contigs; joined to
# the per-isolate RIP estimates (RIP is built earlier in this document).
sum_dim2_blast = dim2_blast_results %>%
  dplyr::filter(Nb_flanking_found_2cat == ">1") %>%
  group_by(sample) %>%
  dplyr::summarize(length_sum = sum(length),
                   pident_wm = weighted.mean(pident, length),
                   n_matches_on_native_contigs = n()) %>%
  # Discard samples whose summed match length exceeds the expected gene size
  # (likely overlapping or duplicated hits rather than one native copy).
  filter(length_sum < length_dim2 + 200) %>%
  # Explicit join key (the previous implicit join printed a message and is
  # fragile if columns are added upstream).
  inner_join(., temp, by = "sample") %>%
  inner_join(., RIP %>% filter(TE == "RIP_est"), by = c("sample" = "ID_file"))



#Plots of identity vs length of the original copy
# Central scatter: native-copy length vs identity, colored by region, point
# shape = number of matches on native contigs.
scatter <- sum_dim2_blast %>%
  ggplot(aes(length_sum, pident_wm,
             color = Continent_Region,
             shape = as.character(n_matches_on_native_contigs))) +
  geom_point() +
  theme_bw() + Color_Continent +
  theme(legend.position = "none") +
  labs( x = "Length of the recovered native copy",
        y = "Identity with Zt10dim2")

# Top marginal: length distribution per region (Asia excluded — few samples).
hist_top = sum_dim2_blast %>%
  filter(Continent_Region != "Asia") %>%
  ggplot(aes(Continent_Region, length_sum,
             fill = Continent_Region, color = Continent_Region)) +
  geom_boxplot(alpha = 0.6, width = 0.4) +
  theme_bw() +
  Fill_Continent + Color_Continent+
  theme(legend.position = "none",
        axis.text.x = element_text(size = 7),
        axis.text.y = element_text(size = 7)) +
  coord_flip() +
  labs( x = "", y = "")

# Right marginal: identity distribution per region.
hist_right = sum_dim2_blast %>%
  filter(Continent_Region != "Asia") %>%
  ggplot(aes(Continent_Region, pident_wm, fill = Continent_Region)) +
  geom_violin(alpha = 0.6) +
  theme_bw() + Fill_Continent +
  theme(legend.position = "none",
        axis.text.x = element_text(size = 7, angle = 20, hjust = 1)) +
  labs( x = "", y = "")


#Options for top right plot
legend_b <- get_legend(
  hist_top +
    guides(color = guide_legend(nrow = 1)) +
    theme(legend.position = "right")
)

# Blank placeholder panel.
# NOTE(review): `empty` is built but not used in the plot_grid call below.
empty <- ggplot()+geom_point(aes(1,1), colour="white")+
         theme(axis.ticks=element_blank(),
               panel.background=element_blank(),
               axis.text.x=element_blank(), axis.text.y=element_blank(),
               axis.title.x=element_blank(), axis.title.y=element_blank())




#Gather all
cowplot::plot_grid(hist_top, legend_b, scatter, hist_right,
                   ncol=2, nrow=2,
                   rel_widths=c(1.2, 1), rel_heights=c(1, 1),
                   align = 'vh')

# Bubble plot: counts of samples per collection and region, faceted by the
# number of matches found on native contigs.
sum_dim2_blast %>%
  filter(Continent_Region != "Asia") %>%
  dplyr::mutate(Nb_frag = as.character(n_matches_on_native_contigs)) %>%
  dplyr::count(Nb_frag, Collection, Continent_Region) %>%
  ggplot(aes(x = Collection, y = Continent_Region, color = Collection)) +
  geom_point(aes(size = n), alpha = 0.6) +
  facet_wrap(vars(Nb_frag)) +
  Fill_Continent +
  theme_bw() +
  theme(legend.position = "none",
        axis.text.x = element_text(size = 7, angle = 20, hjust = 1)) +
  labs( x = "", y = "")

Here a large proportion of the Middle-Eastern samples from the old Hartmann dataset have two matches on the “right” contigs: this is due to a deletion found in one of the alleles of dim2 (shown in Mareike’s new version of her dim2 manuscript).

Let’s investigate quickly how many long copies there are in each genome. I’m also interested in the native match as related to RIP and geography.

# Panel B: RIP composite index vs identity of the native dim2 match.
p1 = sum_dim2_blast %>%
  ggplot(aes(as.numeric(Composite_median), pident_wm, color = Continent_Region,
                      shape = Collection))+
  geom_point() + Color_Continent +
  theme_bw() + theme(legend.position = "none") +
    labs(x = str_wrap(paste("RIP composite index",
                            " (median per isolate)"),
                      width = 20),
         y = str_wrap("Identity of the native match",
                      width = 20))

# Panel C: RIP composite index vs number of long (>1 kb) blast matches.
p2 = sum_dim2_blast %>%
  ggplot(aes(as.numeric(Composite_median), n_long_matches, color = Continent_Region,
                      shape = Collection))+
  geom_point() + Color_Continent +
  theme_bw() + theme(legend.position = "none") +
    labs(x = str_wrap(paste("RIP composite index",
                            " (median per isolate)"),
                      width = 20),
         y = str_wrap("Number of long blast matches (above 1kb)",
                      width = 20))

# Panel A: identity of the native match vs number of long matches.
p3 = sum_dim2_blast %>%
  ggplot(aes(x = pident_wm, y = n_long_matches, color = Continent_Region,
                      shape = Collection)) +
  geom_point() + Color_Continent + theme_bw()+
    labs(x = str_wrap("Identity of the native match",
                      width = 20),
         y = str_wrap("Number of long blast matchess (above 1kb)",
                      width = 20),
         color = "Geographical group") +
  theme(axis.title=element_text(size=10))

# B and C side by side; combined with A further below.
bottom_row <- cowplot::plot_grid(p1, p2, labels = c('B', 'C'), label_size = 12)


# Spearman correlation: native-copy identity vs per-isolate RIP index.
cor.test(sum_dim2_blast$pident_wm, sum_dim2_blast$Composite_median, method="spearman")
## 
##  Spearman's rank correlation rho
## 
## data:  sum_dim2_blast$pident_wm and sum_dim2_blast$Composite_median
## S = 12298290, p-value = 1.104e-08
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.2613409
# Spearman correlation: number of long matches vs RIP index (not significant).
cor.test(sum_dim2_blast$n_long_matches, sum_dim2_blast$Composite_median, method="spearman")
## 
##  Spearman's rank correlation rho
## 
## data:  sum_dim2_blast$n_long_matches and sum_dim2_blast$Composite_median
## S = 16448207, p-value = 0.7951
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## 0.01208886
# Spearman correlation: native-copy identity vs number of long matches.
cor.test(sum_dim2_blast$pident_wm, sum_dim2_blast$n_long_matches, method="spearman")
## 
##  Spearman's rank correlation rho
## 
## data:  sum_dim2_blast$pident_wm and sum_dim2_blast$n_long_matches
## S = 20238861, p-value = 2.779e-06
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## -0.2155852
# Assemble panel A on top of panels B/C built above.
cowplot::plot_grid(p3, bottom_row, labels = c('A', ''), label_size = 12, ncol = 1)

# Panel A: RIP index vs native-copy identity, Hartmann collection only.
p1 = sum_dim2_blast %>%
  filter(Collection == "Hartmann") %>%
  ggplot(aes(as.numeric(Composite_median), pident_wm, color = Continent_Region))+
  geom_point() + Color_Continent +
  theme_bw() + theme(legend.position = "none") +
    labs(x = str_wrap(paste("RIP composite index",
                            " (median per isolate)"),
                      width = 20),
         y = str_wrap("Identity of the native match",
                      width = 20))

# Panel B: same relationship for every other collection.
p2 = sum_dim2_blast %>%
  filter(Collection != "Hartmann") %>%
  ggplot(aes(as.numeric(Composite_median), pident_wm, color = Continent_Region))+
  geom_point() + Color_Continent +
  theme_bw() + theme(legend.position = "none") +
    labs(x = str_wrap(paste("RIP composite index",
                            " (median per isolate)"),
                      width = 20),
         y = str_wrap("Identity of the native match",
                      width = 20))

cowplot::plot_grid(p1, p2, labels = c('A', 'B'), label_size = 12)

#Statistical test
#One-way ANOVA with blocks
##Define linear model
# Native-copy identity explained by geographic region, with sequencing
# collection as a blocking factor. NOTE(review): pident_wm is already numeric,
# so the as.numeric() wrapper is a harmless no-op.
model = lm(as.numeric(pident_wm) ~ Continent_Region + Collection ,
          data=sum_dim2_blast)
summary(model)   ### Will show overall p-value and r-squared
## 
## Call:
## lm(formula = as.numeric(pident_wm) ~ Continent_Region + Collection, 
##     data = sum_dim2_blast)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.7087 -1.0411 -0.1404  0.7935  7.7716 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       93.7009     1.0036  93.361  < 2e-16 ***
## Continent_RegionAsia              -0.4458     1.5901  -0.280   0.7794    
## Continent_RegionEurope            -3.7012     0.6334  -5.843 1.16e-08 ***
## Continent_RegionMiddle East        2.9703     0.6731   4.413 1.36e-05 ***
## Continent_RegionNorth America     -5.3291     0.6444  -8.270 2.72e-15 ***
## Continent_RegionOceania           -2.9616     0.7423  -3.990 8.04e-05 ***
## Continent_RegionSouth America     -5.3007     0.8134  -6.516 2.47e-10 ***
## CollectionC2                       0.8472     0.9656   0.877   0.3809    
## CollectionC3                       0.3463     1.0660   0.325   0.7455    
## CollectionHartmann                 0.2642     0.8528   0.310   0.7569    
## CollectionJGI                      0.2275     0.8383   0.271   0.7863    
## CollectionMM_NZ_TAS               -1.9998     0.9509  -2.103   0.0362 *  
## CollectionThird_batch_2018_BM_TM   0.3948     0.8651   0.456   0.6484    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.06 on 355 degrees of freedom
##   (96 observations deleted due to missingness)
## Multiple R-squared:  0.6487, Adjusted R-squared:  0.6368 
## F-statistic: 54.62 on 12 and 355 DF,  p-value: < 2.2e-16
##Conduct analysis of variance
# Type II sums of squares — appropriate here since the model has no
# interaction term.
Anova(model,type = "II")
## Anova Table (Type II tests)
## 
## Response: as.numeric(pident_wm)
##                  Sum Sq  Df F value    Pr(>F)    
## Continent_Region 2285.1   6 89.7677 < 2.2e-16 ***
## Collection        119.6   6  4.6983 0.0001284 ***
## Residuals        1506.2 355                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Repeated summary of the same model (identical to the earlier call; kept so
# the coefficients appear next to the ANOVA table in the rendered report).
summary(model)
## 
## Call:
## lm(formula = as.numeric(pident_wm) ~ Continent_Region + Collection, 
##     data = sum_dim2_blast)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.7087 -1.0411 -0.1404  0.7935  7.7716 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       93.7009     1.0036  93.361  < 2e-16 ***
## Continent_RegionAsia              -0.4458     1.5901  -0.280   0.7794    
## Continent_RegionEurope            -3.7012     0.6334  -5.843 1.16e-08 ***
## Continent_RegionMiddle East        2.9703     0.6731   4.413 1.36e-05 ***
## Continent_RegionNorth America     -5.3291     0.6444  -8.270 2.72e-15 ***
## Continent_RegionOceania           -2.9616     0.7423  -3.990 8.04e-05 ***
## Continent_RegionSouth America     -5.3007     0.8134  -6.516 2.47e-10 ***
## CollectionC2                       0.8472     0.9656   0.877   0.3809    
## CollectionC3                       0.3463     1.0660   0.325   0.7455    
## CollectionHartmann                 0.2642     0.8528   0.310   0.7569    
## CollectionJGI                      0.2275     0.8383   0.271   0.7863    
## CollectionMM_NZ_TAS               -1.9998     0.9509  -2.103   0.0362 *  
## CollectionThird_batch_2018_BM_TM   0.3948     0.8651   0.456   0.6484    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.06 on 355 degrees of freedom
##   (96 observations deleted due to missingness)
## Multiple R-squared:  0.6487, Adjusted R-squared:  0.6368 
## F-statistic: 54.62 on 12 and 355 DF,  p-value: < 2.2e-16
# Quick normality check of the ANOVA residuals.
hist(residuals(model), col="darkgray")

#Post-hoc analysis:  mean separation tests
# Estimated marginal means per region, averaged over Collection levels.
marginal = lsmeans(model, ~ Continent_Region)

# All pairwise region contrasts, Tukey-adjusted.
pairs(marginal, adjust="tukey")
##  contrast                      estimate    SE  df t.ratio p.value
##  Africa - Asia                   0.4458 1.590 355   0.280 1.0000 
##  Africa - Europe                 3.7012 0.633 355   5.843 <.0001 
##  Africa - Middle East           -2.9703 0.673 355  -4.413 0.0003 
##  Africa - North America          5.3291 0.644 355   8.270 <.0001 
##  Africa - Oceania                2.9616 0.742 355   3.990 0.0016 
##  Africa - South America          5.3007 0.813 355   6.516 <.0001 
##  Asia - Europe                   3.2554 1.505 355   2.164 0.3181 
##  Asia - Middle East             -3.4161 1.532 355  -2.230 0.2822 
##  Asia - North America            4.8833 1.522 355   3.208 0.0243 
##  Asia - Oceania                  2.5158 1.565 355   1.607 0.6777 
##  Asia - South America            4.8549 1.576 355   3.081 0.0358 
##  Europe - Middle East           -6.6715 0.413 355 -16.140 <.0001 
##  Europe - North America          1.6279 0.371 355   4.388 0.0003 
##  Europe - Oceania               -0.7396 0.505 355  -1.464 0.7660 
##  Europe - South America          1.5995 0.637 355   2.511 0.1584 
##  Middle East - North America     8.2994 0.384 355  21.628 <.0001 
##  Middle East - Oceania           5.9319 0.492 355  12.063 <.0001 
##  Middle East - South America     8.2710 0.684 355  12.088 <.0001 
##  North America - Oceania        -2.3675 0.465 355  -5.093 <.0001 
##  North America - South America  -0.0284 0.658 355  -0.043 1.0000 
##  Oceania - South America         2.3391 0.753 355   3.105 0.0334 
## 
## Results are averaged over the levels of: Collection 
## Note: contrasts are still on the as.numeric scale 
## P value adjustment: tukey method for comparing a family of 7 estimates
# Compact letter display: regions sharing a letter in .group are not
# significantly different at alpha = 0.05 (Tukey-adjusted).
CLD_dim = cld(marginal,
          alpha   = 0.05,
          Letters = letters,  ### Use lower-case letters for .group
          adjust  = "tukey")  ### Tukey-adjusted p-values

CLD_dim
##  Continent_Region lsmean    SE  df lower.CL upper.CL .group
##  North America      88.4 0.334 355     87.5     89.3  a    
##  South America      88.4 0.626 355     86.7     90.1  ab   
##  Europe             90.0 0.239 355     89.4     90.7   bc  
##  Oceania            90.8 0.424 355     89.6     91.9    c  
##  Asia               93.3 1.500 355     89.2     97.3    cde
##  Africa             93.7 0.622 355     92.0     95.4     d 
##  Middle East        96.7 0.378 355     95.7     97.7      e
## 
## Results are averaged over the levels of: Collection 
## Results are given on the as.numeric (not the response) scale. 
## Confidence level used: 0.95 
## Conf-level adjustment: sidak method for 7 estimates 
## Note: contrasts are still on the as.numeric scale 
## P value adjustment: tukey method for comparing a family of 7 estimates 
## significance level used: alpha = 0.05
# Strip the padding spaces cld() puts around the group letters.
CLD_dim$.group=gsub(" ", "", CLD_dim$.group)

# Vertical position for annotations: 10% above the largest RIP index.
# BUG FIX: this previously read temp$Composite_median, but `temp` was
# redefined earlier (per-sample match counts) and no longer contains
# Composite_median, yielding max(numeric(0)) = -Inf with a warning; use
# sum_dim2_blast, which carries Composite_median from the RIP join.
text_y = 1.1 * max(as.numeric(sum_dim2_blast$Composite_median), na.rm = TRUE)

And then as boxplots per continental region.

# Left panel: identity of the native copy per region as violins + jittered
# points, with the compact-letter-display groups printed at y = 102.
p1 = sum_dim2_blast %>%
  group_by(Continent_Region) %>%
  dplyr::mutate(avg_per_cont = mean(pident_wm, na.rm = T)) %>%
  ggplot(aes(Continent_Region, pident_wm,
             fill = Continent_Region)) +
  geom_violin(aes(fill = Continent_Region), alpha = 0.5) +
    geom_jitter(aes(col = Continent_Region), size = 0.8, alpha = 0.8, width = 0.2, height = 0.1) +
  Fill_Continent + Color_Continent  +
    theme_light() +
    theme(
    panel.border = element_blank(),
    axis.ticks.x = element_blank(),
    legend.position = "none",
    axis.text.x = element_text(size = 10, angle = 40, hjust = 1)) +
   coord_flip() +
  labs(x = NULL,
       y = "Identity of the native copy to Zt10dim2") +
  geom_text(data = CLD_dim, aes(x = Continent_Region,
                            y = 102,
                            label = .group), color = "black")


# Right panel: number of long dim2 matches per region.
p2 = ggplot(sum_dim2_blast, aes(Continent_Region, n_long_matches,
             fill = Continent_Region)) +
  geom_violin() + Fill_Continent +
  theme_light() +
  theme(panel.grid.major.y = element_blank(),
    panel.border = element_blank(),
    legend.position = "none",
    axis.text.x = element_text(angle = 40, hjust = 1))+
  labs(x = NULL,
       y = "Number of dim2 copies") + coord_flip()
cowplot::plot_grid(p1, p2, rel_widths = c(1, 1))

dim2 tree

# TODO
# Use beginning and end of dim2 gene (100bp) to get gene boundaries on assemblies.
# Update: I tried. But it really does not work better...
#Here is the code of the comparison in case I need to use it later still

#Selecting start and end coordinates which are found on a contig
# with at least one of the flanking genes.
# Hits of the 100-bp dim2 start fragment, kept only when found on a contig
# shared with at least one flanking gene (joins fall back to the shared
# "sample" column since the flank tables only have that column in common).
dim2_start = dim2_blast_results_complete %>%
  filter(gene == "dim2_start") %>%
  full_join(., flank1) %>%
  full_join(., flank2) %>%
  mutate(dim2_flank1 = replace_na(dim2_flank1, "Not_found"),
         dim2_flank2 = replace_na(dim2_flank2, "Not_found")) %>%
  mutate(is_same_contig = ifelse(sseqid == dim2_flank1 & sseqid == dim2_flank2,
                                 "Both",
                                 ifelse(sseqid != dim2_flank1 & sseqid != dim2_flank2,
                                 "None",
                                 ifelse(sseqid == dim2_flank1, "Flank1",
                                 ifelse(sseqid == dim2_flank2, "Flank2", "What")))))%>%
  mutate(Nb_flanking_found_2cat = ifelse(is_same_contig == "None", "0", ">1"))%>%
  filter(Nb_flanking_found_2cat == ">1")

# Same filtering for the 100-bp dim2 end fragment (mirrors dim2_start above).
dim2_end = dim2_blast_results_complete %>%
  filter(gene == "dim2_end") %>%
  full_join(., flank1) %>%
  full_join(., flank2) %>%
  mutate(dim2_flank1 = replace_na(dim2_flank1, "Not_found"),
         dim2_flank2 = replace_na(dim2_flank2, "Not_found")) %>%
  mutate(is_same_contig = ifelse(sseqid == dim2_flank1 & sseqid == dim2_flank2,
                                 "Both",
                                 ifelse(sseqid != dim2_flank1 & sseqid != dim2_flank2,
                                 "None",
                                 ifelse(sseqid == dim2_flank1, "Flank1",
                                 ifelse(sseqid == dim2_flank2, "Flank2", "What")))))%>%
  mutate(Nb_flanking_found_2cat = ifelse(is_same_contig == "None", "0", ">1"))%>%
  filter(Nb_flanking_found_2cat == ">1")


# Table from start/end
#full_join(dim2_start, dim2_end, by = "sample") %>%
#  filter(complete.cases(.)) %>%
#  select(sample, dim2_flank1.x, dim2_flank2.x,
#         gene.x, sseqid.x, sstart.x, send.x, is_same_contig.x, Nb_flanking_found_2cat.x,
#         gene.y, sseqid.y, sstart.y, send.y, is_same_contig.y, Nb_flanking_found_2cat.y) %>%
#  left_join(., RIP %>% filter(TE == "RIP_est"), by = c("sample" = "ID_file")) %>%
#  group_by(is_same_contig.x, Continent_Region) %>%
#  dplyr::summarise(n()) %>%
#  pivot_wider(names_from = Continent_Region, values_from = `n()`)

# Table from the "simple" blast as comparison
#dim2_blast_results %>%
#  dplyr::filter(Nb_flanking_found_2cat == ">1") %>%
#  left_join(., RIP %>% filter(TE == "RIP_est"), by = c("sample" = "ID_file")) %>%
#  filter(length > 2500) %>%
#  dplyr::count(is_same_contig, Continent_Region) %>%
#  pivot_wider(names_from = Continent_Region, values_from = n)

#Keep only copies with:
#   - both flanking genes
#   - one flanking gene but a length of 2500 at least

# Heatmap of the regional share of long (>2.5 kb) dim2 matches, split by
# which flanking gene(s) co-occur on the contig.
dim2_blast_results %>%
  dplyr::filter(Nb_flanking_found_2cat == ">1") %>%
  left_join(., RIP %>% filter(TE == "RIP_est"), by = c("sample" = "ID_file")) %>%
  filter(length > 2500) %>%
  dplyr::count(is_same_contig, Continent_Region) %>%
  dplyr::group_by(Continent_Region) %>%
  dplyr::mutate(n_per_region = sum(n),
                prop = n * 100 / n_per_region) %>%
  ggplot(aes(x = is_same_contig, y = Continent_Region, fill = prop)) +
    geom_tile() +
    scale_fill_viridis_c() +
    theme_bw() +
  labs (title = str_wrap(paste0("Proportion of long dim2 matches ",
                                "with at least one flanking gene"),
                         width = 70))

#Write the bed file to extract the sequences
# Keep native copies (>=1 flanking gene on the same contig, length > 2.5 kb)
# and emit sample/contig/start/end.
dim2_blast_results %>%
  dplyr::filter(Nb_flanking_found_2cat == ">1") %>%
  left_join(., RIP %>% filter(TE == "RIP_est"), by = c("sample" = "ID_file")) %>%
  dplyr::filter(length > 2500) %>%
  # Orient the hit coordinates so start < end (reverse-strand blast hits have
  # sstart > send); pmin/pmax are the vectorized equivalents of the previous
  # paired ifelse() swaps.
  dplyr::mutate(start = pmin(sstart, send),
                end = pmax(sstart, send)) %>%
  dplyr::select(sample, sseqid, start, end) %>%
  write_delim(paste0(TE_RIP_dir, "Coordinates_dim2_all_samples.bed"),
            col_names = F)


#This command has to be run on the cluster
#while read sample chr start end ; do
# echo -e "${chr}\t${start}\t${end}" > temp.bed ;
# ~/Software/bedtools getfasta \
#    -fi /data2/alice/WW_project/4_TE_RIP/1_Blast_from_denovo_assemblies/0_Spades/${sample}.fasta \
#    -bed temp.bed | sed "s/>/>${sample}:/" >> \
#    /data2/alice/WW_project/4_TE_RIP/1_Blast_from_denovo_assemblies/1_Blast_dim2_deRIPped/Native_dim2_all_samples.fasta ;
#done < /data2/alice/WW_project/4_TE_RIP/1_Blast_from_denovo_assemblies/1_Blast_dim2_deRIPped/Coordinates_dim2_all_samples.bed

#Run through the website because laziness
# mafft --thread 8 --threadtb 5 --threadit 0 --reorder --adjustdirection --auto input > output

For the following tree, I used the gene alignment from mafft (online) and created a neighbor-joining tree. The gene sequence from Z. passerinii was used as the outgroup sequence. Then, I attempt to represent both the geographical origin of the samples and the RIP level.

#Prep data to add to tree
# Rebuild the sequence identifiers used as tree tip labels
# (sample_contig_start-end, with "." in contig names replaced by "_").
temp2 = dim2_blast_results %>%
  dplyr::filter(Nb_flanking_found_2cat == ">1") %>%
  left_join(., RIP %>% filter(TE == "RIP_est"), by = c("sample" = "ID_file")) %>%
  dplyr::filter(length > 2500) %>%
  # Orient coordinates so start < end; pmin/pmax replace the previous paired
  # ifelse() swaps (identical results, vectorized idiom).
  dplyr::mutate(start = pmin(sstart, send),
                end = pmax(sstart, send)) %>%
  unite(coord, start, end, sep = "-") %>%
  dplyr::mutate(no_dot = stringr::str_replace(sseqid, "\\.", "_")) %>%
  unite(contig2, sample, no_dot, coord, sep = "_", remove = F)


#Read tree in
#details from the mafft website about the tree
#Size = 237 sequences × 1214 sites
#Method = Neighbor-Joining
#Model = Jukes-Cantor
#Bootstrap resampling = 100

# Parse the NJ tree, split each tip label into a numeric prefix and the
# contig string, and attach the sample metadata (temp2) via the reconstructed
# contig identifier. NOTE(review): mafft's --adjustdirection prefixes reversed
# sequences with "_R_"; str_replace below strips one "R_" occurrence — confirm
# it matches every reversed label.
tree = as_tibble(treeio::read.tree("/Users/feurtey/Documents/Postdoc_Bruce/Projects/WW_project/4_TE_RIP/1_Blast_from_denovo_assemblies/Essai_dim2_tree.nwk")) %>%
  mutate(label_copy = label) %>%
  separate(label_copy, into = c("nb", "contig"), extra = "merge", remove =F) %>%
  dplyr::mutate(nb = as.integer(nb)) %>%
  dplyr::mutate(label_OG = label) %>%
  dplyr::mutate(contig2 = stringr::str_replace(contig, "R_", "")) %>%
  full_join(., temp2, by = c("contig2")) %>%
  dplyr::mutate(label_temp = ifelse(is.na(sample), ifelse(is.na(contig), label, contig), sample)) %>%
  unite(col = "label_new", nb, label_temp, sep = "_")

tree2 = as_tibble(treeio::read.tree("/Users/feurtey/Documents/Postdoc_Bruce/Projects/WW_project/4_TE_RIP/1_Blast_from_denovo_assemblies/Essai_dim2_tree.nwk"))

temp = tree %>%
  dplyr::select(label, node, label_new, Continent_Region, Composite_median, Collection) %>%
  mutate(Composite_median = ifelse(is.na(Composite_median), 0, Composite_median)) %>%
  filter(!is.na(label)) %>%
  mutate(RIP = ifelse(Composite_median >= 1.5, "High", ifelse(Composite_median <= 1, "Low", "Medium")))

#Find the outgroup (the Z. passerinii sequence, Zpa63) and root the tree on it.
root_node = tree2 %>%
  filter(str_detect(label, "Zpa63")) %>%
  pull(node)

rooted.tree <- ape::root(as.treedata(tree2), root_node)
#Base tree with tip metadata attached (the label column is dropped to avoid
#clashing with ggtree's own label column).
p <- ggtree(rooted.tree, layout = "rectangular") %<+% (temp %>% dplyr::select(-label))

#Tips coloured by geographic origin.
p2 <- p +
  geom_tippoint(aes(color = Continent_Region)) +
  theme(legend.position = "right") +
  Color_Continent

#Tips coloured by RIP level.
p3 <- p +
  geom_tippoint(aes(color = RIP), alpha = 0.5) +
  theme(legend.position = "right") +
  scale_color_manual(values = c("darkred", "yellow", "orange"))

#Tips coloured by sampling collection.
p4 <- p +
  geom_tippoint(aes(color = Collection), alpha = 0.5) +
  theme(legend.position = "right")

cowplot::plot_grid(p2, p3)

#Heatmap strip of RIP level alongside the geography-coloured tree.
#gheatmap() matches heatmap rows to tips via rownames; setting rownames on a
#tibble is deprecated and unreliable, so coerce to a plain data.frame first.
df = as.data.frame(temp %>% dplyr::select(RIP))
rownames(df) <- temp$label
gheatmap(p2, df, width=.2) +
  scale_fill_manual(values = c("darkred", "yellow", "orange"), name = "RIP") +
  labs(title = "Gene tree for the dim2 sequence",
       y = "")

Based on these trees, there is some clustering according to geography. Additionally, it looks like the Middle-Eastern samples have either a high or low RIP level on their TE, whereas the level is rather in the middle of the range elsewhere…

Gene duplication and RIP

PacBio samples from the pangenome

If there is a relaxation of RIP, we could expect that TEs would not be the only elements impacted: gene duplications should also be more frequent in RIP-relaxed genomes than in genomes where RIP is active.

#Badet et al. pangenome table: for each isolate, the percentage of its genes
#that belong to orthogroups present in 2 to 10 copies in that isolate.
Badet_pan_table = read_tsv(paste0(TE_RIP_dir, "Badet_GLOBAL_PANGENOME_TABLE.txt"))

Badet_pan_table %>%
  dplyr::select(isolate, orthogroup, N_genes, N_isolates, cat) %>%
  group_by(isolate) %>%
  dplyr::mutate(nb_genes = n()) %>%                 #total genes per isolate
  group_by(isolate, N_genes) %>%
  dplyr::summarize(count = n(), nb_genes = mean(nb_genes)) %>%
  dplyr::mutate(Percent = 100 * count / nb_genes) %>%
  dplyr::filter(dplyr::between(N_genes, 2, 10)) %>%
  ggplot(aes(isolate, Percent, fill = N_genes)) +
  geom_bar(stat = "identity") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 40, hjust = 1)) +
  scale_fill_gradient(low = "#EAEAEA", high = "#294D4A", na.value = NA)

#Same as above, but copy numbers >= 9 are lumped into a "9 +" category and all
#duplicated orthogroups (>= 2 copies) are kept.
Badet_pan_table %>%
  dplyr::select(isolate, orthogroup, N_genes, N_isolates, cat) %>%
  group_by(isolate) %>%
  dplyr::mutate(nb_genes = n()) %>%
  dplyr::filter(N_genes >= 2) %>%
  dplyr::mutate(N_genes_cat = ifelse(N_genes >= 9, "9 +", as.character(N_genes))) %>%
  group_by(isolate, N_genes_cat) %>%
  dplyr::summarize(count = n(), nb_genes = mean(nb_genes)) %>%
  dplyr::mutate(Percent = 100 * count / nb_genes) %>%
  ggplot(aes(isolate, Percent, fill = N_genes_cat)) +
  geom_bar(stat = "identity") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 40, hjust = 1))

#Restricted to orthogroups in exactly two copies per isolate.
Badet_pan_table %>%
  dplyr::select(isolate, orthogroup, N_genes, N_isolates, cat) %>%
  group_by(isolate) %>%
  dplyr::mutate(nb_genes = n()) %>%
  dplyr::filter(N_genes >= 2) %>%
  dplyr::mutate(N_genes_cat = ifelse(N_genes >= 9, "9 +", as.character(N_genes))) %>%
  group_by(isolate, N_genes_cat) %>%
  dplyr::summarize(count = n(), nb_genes = mean(nb_genes)) %>%
  dplyr::mutate(Percent = 100 * count / nb_genes) %>%
  dplyr::filter(N_genes_cat == 2) %>%  #character column; "2" == 2 via coercion
  ggplot(aes(isolate, Percent, fill = N_genes_cat)) +
  geom_bar(stat = "identity") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 40, hjust = 1))

On Illumina samples

#Illumina isolates: number of duplicated genes against the RIP composite index
#and against the identity of the dim2 native match (isolates with >= 400
#duplicated genes excluded as outliers).
temp = inner_join(sum_dim2_blast, duplicated_genes,
                  by = c("sample" = "ID_file", "Continent_Region")) %>%
  filter(Nb_duplicated_genes < 400)

p1 = ggplot(temp, aes(as.numeric(Composite_median), Nb_duplicated_genes,
                      color = Continent_Region, shape = Collection)) +
  geom_point() +
  Color_Continent +
  theme_bw() +
  labs(x = str_wrap(paste("RIP composite index",
                          " (median per isolate)"),
                    width = 20),
       y = str_wrap("Number of duplicated genes", width = 20))

p2 = ggplot(temp, aes(pident_wm, Nb_duplicated_genes,
                      color = Continent_Region, shape = Collection)) +
  geom_point() +
  Color_Continent +
  theme_bw() +
  theme(legend.position = "none") +
  labs(x = str_wrap(paste("Identity of the native match"), width = 20),
       y = str_wrap("Number of duplicated genes", width = 20))

#Shared legend extracted from p1 and placed beside the two stacked panels.
legend_b <- get_legend(p1 + theme(legend.position = "right"))
top = cowplot::plot_grid(p1 + theme(legend.position = "none"), p2, ncol = 1)
cowplot::plot_grid(top, legend_b, nrow = 1)



Adaptation to local climatic conditions


For the following points, different types of data need to be collected.

Genomic data: ideally, we would work from both SNPs and CNV of genes as much as possible (not including TE at the moment, because too complex/uncertain?) + Add the structural variants if/when we can detect them from Illumina data?

Geographic/environmental data: For this, I will use the sampling site of each isolate (as precisely as I can manage) to approximate environmental parameters such as temperature, precipitation or solar radiation. One possibility to find such data is this website, https://www.worldclim.org/data/worldclim21.html (published in Fick and Hijmans, 2017), which gives access to climate data for 1970-2000. These can be transformed into bioclimatic variables using the biovars function from the R package dismo (https://rdrr.io/cran/dismo/man/biovars.html).

#Unique sampling coordinates, turned into SpatialPoints for raster extraction.
temp = Zt_meta_for_shiny %>%
  mutate(X = as.numeric(unlist(Longitude)),
         Y = as.numeric(unlist(Latitude))) %>%
  dplyr::select(X, Y) %>%
  distinct()

sp = SpatialPoints(temp[, c("X", "Y")])
summary(sp)
## Object of class SpatialPoints
## Coordinates:
##         min      max
## X -122.9300 175.6576
## Y  -46.2187  59.3294
## Is projected: NA 
## proj4string : [NA]
## Number of points: 97
#Bioclim data (from Worldclim)
#Human-readable names for the 19 WorldClim bioclimatic variables (BIO1-BIO19),
#used below to name the extracted raster layers.
#Fixed: stray trailing ")" removed from the last name.
Bioclim_var = c("Annual Mean Temperature", "Mean Diurnal Range ",
                "Isothermality (BIO2/BIO7) (×100)", "Temperature Seasonality (standard deviation ×100)",
                "Max Temperature of Warmest Month", "Min Temperature of Coldest Month",
                "Temperature Annual Range (BIO5-BIO6)",
                "Mean Temperature of Wettest Quarter","Mean Temperature of Driest Quarter",
                "Mean Temperature of Warmest Quarter", "Mean Temperature of Coldest Quarter",
                "Annual Precipitation", "Precipitation of Wettest Month",
                "Precipitation of Driest Month", "Precipitation Seasonality (Coefficient of Variation)",
                "Precipitation of Wettest Quarter", "Precipitation of Driest Quarter",
                "Precipitation of Warmest Quarter","Precipitation of Coldest Quarter")

#Extract the first 12 bioclimatic layers (BIO1-BIO12) at each sampling point.
#NOTE(review): Bioclim_var holds 19 names but only 12 layers are read here --
#confirm that BIO13-BIO19 are deliberately left out.
bio_list = list()
for (i in 1:12) {
  bio_file = paste0(data_dir, "Climatic_data/wc2.1_10m_bio/wc2.1_10m_bio_", i, ".tif")
  bio_list[[Bioclim_var[i]]] = raster::extract(raster(bio_file), sp)
}

#Adding solar radiation: extract the 12 monthly layers, then keep for each
#coordinate the maximum monthly value.
sol_list = list()
for (i in 1:12) {
  srad_file = paste0(data_dir, "Climatic_data/wc2.1_10m_srad/wc2.1_10m_srad_",
                     str_pad(i, 2, pad = "0"), ".tif")
  sol_list[[i]] = raster::extract(raster(srad_file), sp)
}
sol_months = cbind(temp, do.call(cbind, sol_list))
sol_months$srad_max = apply(sol_months[, 3:ncol(sol_months)], 1, max)
sol_months = sol_months %>% dplyr::select(X, Y, srad_max)


#One row per coordinate with all climate variables, joined back to the
#isolates and reshaped to long format for faceted histograms.
climate_per_coordinates = cbind(temp, do.call(cbind, bio_list)) %>%
  full_join(., sol_months)
dat = Zt_meta_for_shiny %>%
  dplyr::select(ID_file, Latitude, Longitude, Sampling_collection) %>%
  full_join(., climate_per_coordinates,
            by = c("Longitude" = "X", "Latitude" = "Y")) %>%
  dplyr::select(-ID_file) %>%
  gather(key = "Bioclim_var", value = "Estimate", -c(Longitude, Latitude, Sampling_collection))

#Fixed colour mapping per sampling collection, reused across plots.
myColors = c("#ec9a29", "#0f8b8d", "#143642")
names(myColors) = levels(factor(dat$Sampling_collection))
colScale = scale_colour_manual(name = "Sampling_collection", values = myColors)
colScaleFill = scale_fill_manual(name = "Sampling_collection", values = myColors)

ggplot(dat, aes(Estimate, fill = Sampling_collection, color = Sampling_collection)) +
  geom_histogram(position = "stack") +
  facet_wrap(. ~ Bioclim_var, scales = "free") +
  theme_classic() + colScale + colScaleFill

Naturally, many of the variables investigated above are highly correlated. It is intuitive for example that the minimum temperature of the coldest month would be correlated to the average temperature of the coldest quarter! Here, I visualize these correlations.

#Correlogram of the climate variables.
#NOTE(review): the correlation matrix includes the X/Y coordinate columns of
#climate_per_coordinates -- confirm they are meant to be part of it.
clim_cor = cor(climate_per_coordinates)
corrplot(clim_cor, type = "lower", order = "hclust",
         col = mycolorsCorrel,
         tl.col = "black", tl.srt = 45, tl.cex = 0.7)

#Simple heatmap of the same matrix.
heatmap(x = clim_cor, col = mycolorsCorrel, symm = TRUE)

How to choose the best variables?

  • Only keep one per “correlation block”?
  • Create composite variables with a PCA? Only meaningful if we can interpret the principal component.
#PCA on the climate variables (coordinates column X excluded), coloured by
#region. NOTE(review): country_climate rows are assumed to line up with the
#rows of climate_per_coordinates -- verify, since one comes from the metadata
#and the other from the coordinate table.
country_climate = Zt_meta_for_shiny %>%
  dplyr::select(Latitude, Longitude, Continent_Region) %>%
  distinct()
bioclim.pca <- prcomp(climate_per_coordinates[, 2:ncol(climate_per_coordinates)], scale. = TRUE)
ggbiplot(bioclim.pca, obs.scale = 1, var.scale = 1,
         group = country_climate$Continent_Region, ellipse = TRUE) +
  Color_Continent +
  theme(legend.direction = 'horizontal', legend.position = 'top') +
  xlim(-10, 20) + ylim(-5, 20)

#Network view of pairwise correlations (quarter-based variables and the X/Y
#coordinates excluded); only edges with |r| > 0.3 are drawn.
cor_long <- climate_per_coordinates %>%
  dplyr::select(everything(), -contains("uarter"), -Y, -X) %>%
  correlate() %>%
  stretch()

cor_graph <- cor_long %>%
  filter(abs(r) > .3) %>%
  graph_from_data_frame(directed = FALSE)

ggraph(cor_graph) +
  geom_edge_link(aes(edge_alpha = .5, edge_width = abs(r), color = r)) +
  guides(edge_alpha = "none", edge_width = "none") +
  scale_edge_colour_gradientn(limits = c(-1, 1), colors = c("#0f8b8d", "#a8201a")) +
  geom_node_point(color = "black", size = 3) +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_graph() +
  labs(title = "Correlations between bioclimatic variables")

GEA: genotype-environment association

Methods: Several software to explore:

# Transpose the genotype table (SNPs in rows -> isolates in rows) so it can be
# read as a genotype matrix for the GEA below.
datamash transpose \
    < ${POPSTR}$VCFNAME.pass_noNA.GT.FORMAT2 \
    > ${GEADIR}$VCFNAME.pass_noNA.GT.FORMAT2.transposed
#Create standardized (z-scored) values for the two environmental variables
#used in the GEA, keyed by coordinate.
climate_per_coord_standard = climate_per_coordinates %>%
  transmute(X, Y,
            Temp = scale(`Annual Mean Temperature`),
            Rain = scale(`Annual Precipitation`))
#Get genotypes: one row per isolate after transposition; the first column (X1)
#holds the individual names and is dropped.
genotypes = read_tsv(paste0(GEA_dir, vcf_name, ".pass_noNA.GT.FORMAT2.transposed"),
                     col_names = FALSE) %>%
  dplyr::select(-X1)
ind_list = read_tsv(paste0(PopStr_dir, vcf_name, ".pass_noNA.ind"), col_names = "ID_file")
genotypes_temp = bind_cols(genotypes, ind_list)

#Get environmental data per isolate, ordered like the genotype matrix (the
#join onto ind_list enforces that row order).
#Fixed: a stray `tibble(name = target)` argument was being passed positionally
#to left_join() -- `target` is undefined and the tibble silently bound to the
#`copy` argument; removed.
temp_Zt_meta = left_join(Zt_meta, climate_per_coord_standard,
                         by = c("Latitude" = "Y", "Longitude" = "X"))
temp_Zt_meta = left_join(ind_list,
                         temp_Zt_meta %>% dplyr::select(ID_file, Temp, Rain),
                         by = "ID_file") %>%
  dplyr::select(-ID_file)  #removes ind_names


# Fit a sparse LFMM with K = 6 latent factors, allowing ~2% of SNP effects to
# be non-zero, then test every SNP against the standardized Temp and Rain
# variables. "gif" applies genomic-inflation-factor calibration to the
# p-values. NOTE(review): K = 6 presumably matches the number of clusters
# found in the population-structure analyses -- confirm.
mod.lfmm <- lfmm_lasso(Y = genotypes, X = temp_Zt_meta, K=6,
                        nozero.prop = 0.02)
pv <- lfmm::lfmm_test(Y = genotypes,
                 X = temp_Zt_meta,
                 lfmm = mod.lfmm,
                 calibrate = "gif")
# Matrix of calibrated p-values, one column per environmental variable.
pvalues <- pv$calibrated.pvalue

I need to check whether the obtained p-values are similar to what we would expect. Unfortunately, not much in the two examples I have chosen!

#QQ-plots of the calibrated p-values against the exponential expectation
#(-log10 of uniform p-values ~ Exp(rate = log(10))).
#Fixed: each panel now plots its own column of `pvalues`; previously both
#panels plotted -log10 of the whole matrix.
par(mfrow=c(1, 2))
qqplot(rexp(length(pvalues[,"Rain"]), rate = log(10)),
       -log10(pvalues[,"Rain"]), xlab = "Expected quantile",
       main = "Rain",
       pch = 19, cex = .4, col = "#82C0CC")
abline(0,1)

qqplot(rexp(length(pvalues[,"Temp"]), rate = log(10)),
       -log10(pvalues[,"Temp"]), xlab = "Expected quantile",
       main = "Temp",
       pch = 19, cex = .4, col = "#FFA62B")
abline(0,1)

par(mfrow=c(1, 1))

Here are the resulting Manhattan plots. Seeing the two qqplots above, none of this should be taken too seriously!

#SNP positions matching the genotype matrix rows.
positions = read_tsv(paste0(PopStr_dir, vcf_name, ".pass_noNA.GT.FORMAT.pos"), col_names = TRUE)

## Manhattan plot: one panel per environmental variable.
as_tibble(as.data.frame(pvalues)) %>%
  mutate(SNP = seq_len(nrow(pvalues))) %>%
  gather(key = "Bioclimatic variable", value = "pvalue", -SNP) %>%
  ggplot(aes(x = SNP, y = -log10(pvalue), color = `Bioclimatic variable`)) +
  geom_point(alpha = 0.5) +
  facet_wrap(~`Bioclimatic variable`, scales = "free") +
  theme_bw() + scale_color_manual(values = c("#82C0CC", "#FFA62B"))

## Same, coloured by chromosome (alternating shades).
bind_cols(positions %>% dplyr::select(-POS), as_tibble(as.data.frame(pvalues))) %>%
  mutate(SNP = seq_len(nrow(pvalues))) %>%
  gather(key = "Bioclimatic variable", value = "pvalue", -SNP, -CHROM) %>%
  ggplot(aes(x = SNP, y = -log10(pvalue), color = factor(CHROM))) +
  geom_point(alpha = 0.5) +
  facet_grid(rows = vars(`Bioclimatic variable`), scales = "free") +
  theme_bw() + scale_color_manual(values = rep(c("#489fb5", "#82c0cc"), 10))

# For each of the 13 chromosomes, build two genotype subsets with vcftools:
#   "only"     -> biallelic, fully-genotyped SNPs (maf >= 0.05) on that chromosome
#   "excepted" -> same filters on all OTHER chromosomes (used below to build a
#                 bayenv2 covariance matrix that excludes the focal chromosome)
for CHR in {1..13}
do


  vcftools --vcf ${VCFDIR}$VCFNAME.recode.vcf \
    --keep $ZTLIST.txt \
    --remove-filtered-all --extract-FORMAT-info GT \
    --max-missing 1.0 --min-alleles 2 --max-alleles 2 \
    --maf 0.05 \
    --chr ${CHR} \
    --out ${GEADIR}Chr_${CHR}_only

  vcftools --vcf ${VCFDIR}$VCFNAME.recode.vcf \
    --keep $ZTLIST.txt \
    --remove-filtered-all --extract-FORMAT-info GT \
    --max-missing 1.0 --min-alleles 2 --max-alleles 2 \
    --maf 0.05 \
    --not-chr ${CHR} \
    --out ${GEADIR}Chr_${CHR}_excepted

  # Drop the CHROM/POS columns, keeping only the genotype columns.
  for i in only excepted ;
  do
    cat  ${GEADIR}Chr_${CHR}_${i}.GT.FORMAT | cut -f 3- > ${GEADIR}Chr_${CHR}_${i}.GT.FORMAT2
  done
done
from collections import defaultdict

# For each isolate, store its pop (sampling-site coordinates) in a dictionary.
dict_pop = dict(zip(r.Zt_meta["ID_file"], r.Zt_meta["Coordinates"]))

# Keep the population names/coordinates in a fixed order so that every file
# written below uses the same column order.
all_pops = list(set(r.Zt_meta["Coordinates"]))
bayenv_pop_name = r.GEA_dir + "SNPSFILE_pop"
with open(bayenv_pop_name, "w") as temp:
    temp.write("\n".join(all_pops))

# Convert each per-chromosome genotype table into bayenv2 SNPSFILE format:
# two lines per SNP (allele-0 counts, then allele-1 counts), one column per
# population. Avoids shadowing the `type` builtin and uses context managers so
# files are always closed.
for chrom in range(1, 14):
    print(chrom)
    for subset in ("only", "excepted"):
        in_name = r.GEA_dir + "Chr_" + str(chrom) + "_" + subset + ".GT.FORMAT2"
        out_name = r.GEA_dir + "Chr_" + str(chrom) + "_" + subset + ".SNPSFILE"
        with open(out_name, "w") as out, open(in_name, "r") as input_snps:
            indv = []
            for i, snp in enumerate(input_snps):
                # The first line holds the isolate names.
                if i == 0:
                    indv = snp.strip().split("\t")
                    continue
                # Count 0 and 1 alleles per population for this SNP.
                dict_snp0 = defaultdict(int)
                dict_snp1 = defaultdict(int)
                lets_write = True
                for ind, allele in zip(indv, snp.strip().split("\t")):
                    if allele == "0":
                        dict_snp0[dict_pop[ind]] += 1
                    elif allele == "1":
                        dict_snp1[dict_pop[ind]] += 1
                    else:
                        print("Only biallelic please!!!!")
                        lets_write = False
                # Write only SNPs whose genotypes are strictly 0/1.
                if lets_write:
                    out.write("\t".join(str(dict_snp0[pop]) for pop in all_pops) + "\n")
                    out.write("\t".join(str(dict_snp1[pop]) for pop in all_pops) + "\n")
#TODO paste0(vcf_name, ".pass_noNA.GT.FORMAT2.SNPSFILE_pop") -> py$bayenv_pop_name
#Environmental values (Temp/Rain) reordered to match the population order of
#the SNPSFILE, then written out for transposition into the bayenv2 ENVIRONFILE.
climate_per_coord_ordered = left_join(
  read_tsv(paste0(GEA_dir, "SNPSFILE_pop"), col_names = FALSE),
  climate_per_coord_standard %>%
    unite(X1, Y, X, remove = FALSE, sep = ";"))
climate_per_coord_ordered %>%
  dplyr::select(-X1, -X, -Y) %>%
  mutate(Temp = round(Temp, 3), Rain = round(Rain, 3)) %>%
  write_delim(paste0(GEA_dir, "ENVIRONFILE_totranspose"), delim = "\t", col_names = FALSE)
#Export the number of populations to the shell chunks below.
nb_pop = nrow(read_tsv(paste0(GEA_dir, "SNPSFILE_pop"), col_names = FALSE))
Sys.setenv(NUMPOPS = nb_pop)
# Sanity check: number of populations exported from R.
echo $NUMPOPS

# Genome-wide covariance matrix from the first 10000 SNPSFILE lines
# (50001 MCMC iterations; the final matrix is the last NUMPOPS+1 lines of
# the bayenv2 log).
head -n 10000 ${GEADIR}SNPSFILE > temp ;
cp /Users/feurtey/Documents/Software/bayenv2_mac/bayenv2 ${GEADIR}
cd ${GEADIR}

${GEADIR}bayenv2 -i temp -p $NUMPOPS -k 50001 > temp.mat
tail -n $(($NUMPOPS + 1)) temp.mat > temp.MATRIXFILE

# One leave-one-chromosome-out covariance matrix per chromosome.
for CHR in {1..13}
do

   head -n 10000 ${GEADIR}Chr_${CHR}_excepted.SNPSFILE > temp ;
   ${GEADIR}bayenv2 -i temp -p $NUMPOPS -k 50001 > temp.mat ;
   tail -n $(($NUMPOPS + 1)) temp.mat > ${GEADIR}Chr_${CHR}_excepted.MATRIXFILE ;

done
# ENVIRONFILE: one row per environmental variable, one column per population.
datamash transpose < ${GEADIR}ENVIRONFILE_totranspose > ${GEADIR}ENVIRONFILE
NUMENV=$(cat  ${GEADIR}ENVIRONFILE | wc -l)
cd ${GEADIR}
echo $NUMPOPS $NUMENV

# Run bayenv2 SNP by SNP: each SNP spans two lines of the SNPSFILE, so line
# pairs are sliced out with head/tail and tested against the ENVIRONFILE using
# the leave-one-chromosome-out covariance matrix of the SNP's chromosome.
for CHR in {1..13}
do

   #To do: CHECK IF THE PER SNP LOOP DOES WHAT I THINK IT DOES

   #Get number of SNPs (two SNPSFILE lines per SNP)
   lnNB=$(cat ${GEADIR}Chr_${CHR}_only.SNPSFILE | wc -l)
   SNPNB=$(($lnNB/2))
   echo $CHR, $SNPNB

   #Run bayenv per snp
   for i in $(seq 1 ${SNPNB}) ;
   do
     CHOSENLINES=$(($i*2))
     head -n $CHOSENLINES ${GEADIR}Chr_${CHR}_only.SNPSFILE | tail -n 2 > temp.SNPFILE

     # -t: test mode (Bayes factors); results are appended to the per
     # chromosome output file.
     ./bayenv2 \
      -i temp.SNPFILE \
      -e ENVIRONFILE \
      -m Chr_${CHR}_excepted.MATRIXFILE \
      -p ${NUMPOPS} -n ${NUMENV} \
      -k 10000 -t \
      -o Chr_${CHR}.Bayenv2_out
   done

done
#Quick look at the bayenv2 Bayes factors (column X2) along one chromosome.
CHR <- 7
temp = read_tsv(paste0(GEA_dir, "Chr_", CHR, ".Bayenv2_out.bf"), col_names = FALSE)
temp$ID <- seq.int(nrow(temp))
temp %>%
  ggplot(aes(ID, X2)) +
  geom_point()



Fungicide resistance


This is based on the same scripts made for Fanny Hartmann’s paper on the pairs of populations: looking for alleles already known to be involved in resistance to fungicide.

#Known fungicide-resistance genotypes: each allele is classified as "Origin"
#(matches the reference amino acid encoded in the AA_change label) or
#"Mutant", then joined to the isolate metadata.
genotypes_resistance = read_tsv(paste0(fung_dir, "genotypes_tidy_format.tab"),
                                col_names = c("temp", "ID_file", "Allele")) %>%
  separate(temp, into = c("Gene", "AA_change"), sep = "\\.") %>%
  dplyr::mutate(AA_REF = str_sub(AA_change, 1, 1),
                Allele_type = ifelse(AA_REF == Allele, "Origin", "Mutant")) %>%
  left_join(Zt_meta)

#Counts of original vs mutant alleles per amino-acid change, faceted by gene.
genotypes_resistance %>%
  dplyr::count(AA_change, Gene, Allele_type) %>%
  ggplot(aes(x = n, y = AA_change, fill = Allele_type)) +
  geom_barh(stat = "identity") +
  facet_grid(Gene ~ ., scales = "free_y", space = "free_y")

I suspect that there will be an effect of both geography and time on the frequency of these alleles and so I try to represent this here.

#CYP51: proportion of mutant alleles per region and year, keeping only the
#mutations observed at least once.
temp = genotypes_resistance %>%
  filter(Gene == "CYP51") %>%
  dplyr::count(AA_change, Continent_Region, Date_Year, Allele_type) %>%
  pivot_wider(names_from = Allele_type, values_from = n, values_fill = list(n = 0)) %>%
  mutate(prop_mutant = Mutant / (Mutant + Origin)) %>%
  filter(!is.na(Date_Year), !is.na(Continent_Region)) %>%
  group_by(AA_change) %>%
  dplyr::mutate(Somme = sum(Mutant)) %>%  #total mutants per mutation
  filter(Somme > 0)

temp %>%
  ggplot(aes(x = Date_Year, y = Continent_Region,
             size = prop_mutant, fill = prop_mutant)) +
  geom_point(shape = 21, col = myColors[3], alpha = 0.7) +
  theme_bw() +
  facet_wrap(. ~ AA_change) +
  scale_fill_gradient(low = "white", high = "#16697a") +
  labs(title = str_wrap(paste("Mutant proportion per known mutation",
                              "in the gene CYP51"), width = 35),
       subtitle = str_wrap(paste("This gene can mutate and cause resistance to azole fungicides."), width = 65),
       fill = "Proportion of mutants",
       size = "Proportion of mutants",
       x = "Years", y = "")

#Beta_tubulin: same representation as for CYP51.
genotypes_resistance %>%
  filter(Gene == "beta_tubulin") %>%
  dplyr::count(AA_change, Continent_Region, Date_Year, Allele_type) %>%
  pivot_wider(names_from = Allele_type, values_from = n, values_fill = list(n = 0)) %>%
  mutate(Number_all = Mutant + Origin,
         prop_mutant = Mutant / Number_all) %>%
  group_by(AA_change) %>%
  dplyr::mutate(Somme = sum(Mutant)) %>%
  filter(Somme > 0) %>%
  ggplot(aes(x = Date_Year, y = Continent_Region,
             size = prop_mutant, fill = prop_mutant)) +
  geom_point(shape = 21, col = myColors[3], alpha = 0.7) +
  theme_bw() +
  facet_wrap(. ~ AA_change) +
  scale_fill_gradient(low = "white", high = "#16697a") +
  labs(title = str_wrap(paste("Mutant proportion per known mutation",
                              "in the beta tubuline gene"), width = 35),
       subtitle = str_wrap(paste("This gene can mutate and cause resistance",
                                 " to benzimidazole fungicides."), width = 65),
       fill = "Proportion of mutants",
       size = "Proportion of mutants",
       x = "Years", y = "")

# Mitochondrial_cytb: same representation as for CYP51.
genotypes_resistance %>%
  filter(Gene == "mitochondrial_cytb") %>%
  dplyr::count(AA_change, Continent_Region, Date_Year, Allele_type) %>%
  pivot_wider(names_from = Allele_type, values_from = n, values_fill = list(n = 0)) %>%
  mutate(Number_all = Mutant + Origin,
         prop_mutant = Mutant / Number_all) %>%
  group_by(AA_change) %>%
  dplyr::mutate(Somme = sum(Mutant)) %>%
  filter(Somme > 0) %>%
  ggplot(aes(x = Date_Year, y = Continent_Region,
             size = prop_mutant, fill = prop_mutant)) +
  geom_point(shape = 21, col = myColors[3], alpha = 0.7) +
  theme_bw() +
  facet_wrap(. ~ AA_change) +
  scale_fill_gradient(low = "white", high = "#16697a") +
  labs(title = str_wrap(paste("Mutant proportion per known mutation",
                              "in the mitochondrial gene cytb"), width = 35),
       subtitle = str_wrap(paste("This gene can mutate and cause resistance ",
                                 "to QoI, or Quinone outside inhibitors."), width = 65),
       fill = "Proportion of mutants",
       size = "Proportion of mutants",
       x = "Years", y = "")

# SDH genes: same representation, with gene and mutation combined in a label
# since several SDH subunits are tracked.
genotypes_resistance %>%
  filter(grepl("SDH", Gene)) %>%
  unite(Label, Gene, AA_change, remove = TRUE) %>%
  dplyr::count(Label, Continent_Region, Date_Year, Allele_type) %>%
  pivot_wider(names_from = Allele_type, values_from = n, values_fill = list(n = 0)) %>%
  mutate(Number_all = Mutant + Origin,
         prop_mutant = Mutant / Number_all) %>%
  group_by(Label) %>%
  dplyr::mutate(Somme = sum(Mutant)) %>%
  filter(Somme > 0) %>%
  ggplot(aes(x = Date_Year, y = Continent_Region,
             size = prop_mutant, fill = prop_mutant)) +
  geom_point(shape = 21, col = myColors[3], alpha = 0.7) +
  theme_bw() +
  facet_wrap(. ~ Label) +
  scale_fill_gradient(low = "white", high = "#16697a") +
  labs(title = str_wrap(paste("Mutant proportion per known mutation",
                              "in one of the SDH genes"), width = 35),
       subtitle = str_wrap(paste("This gene can mutate and cause resistance to SDHI fungicides."), width = 65),
       fill = "Proportion of mutants",
       size = "Proportion of mutants",
       x = "Years", y = "")



Virulence to new varieties


Thierry Marcel’s virulence GWAS

I use the alleles identified in the GWAS of Thierry Marcel and track them in the world-wide populations.

#Alleles identified in the French virulence GWAS; up to two virulence alleles
#per position are recorded, comma-separated.
virulence_table = read_tsv(paste0(virulence_dir, "French_GWAS_virulence_alleles.tab")) %>%
  separate(VIRULENCE_ALLELES, into = c("VIRULENCE_ALLELE1", "VIRULENCE_ALLELE2"), sep = ",")

#Genotypes at the GWAS positions: translate allele numbers (0/1/2) into bases
#and classify each call as virulent or non-virulent. The nested ifelse() is
#kept on purpose so that missing genotypes (na = ".") stay NA.
french_GWAS_variants = read_tsv(paste0(virulence_dir, "Positions_GWAS.short.vcf"),
                                na = ".") %>%
  pivot_longer(-c(`#CHROM`, POS, REF, ALT), names_to = "ID_file", values_to = "Allele_nb") %>%
  separate(ALT, into = c("ALT1", "ALT2"), sep = ",") %>%
  mutate(Allele_ID = ifelse(Allele_nb == 0, REF, ifelse(Allele_nb == 1, ALT1, ALT2))) %>%
  left_join(virulence_table) %>%
  mutate(virulence = ifelse(Allele_ID == NON_VIRULENT_ALLELES, "Non_virulent",
                            ifelse(Allele_ID == VIRULENCE_ALLELE1 | Allele_ID == VIRULENCE_ALLELE2,
                                   "Virulent", NA)))

french_GWAS_variants = inner_join(french_GWAS_variants, Zt_meta)


#Proportion of virulent alleles per region and year, one panel per GWAS
#position.
french_GWAS_variants %>%
  tidyr::unite(position, `#CHROM`, POS) %>%
  dplyr::count(position, Continent_Region, Date_Year, virulence) %>%
  pivot_wider(names_from = virulence, values_from = n, values_fill = list(n = 0)) %>%
  mutate(Number_all = Virulent + Non_virulent,
         prop_virulent = Virulent / Number_all) %>%
  ggplot(aes(x = Date_Year, y = Continent_Region,
             size = prop_virulent, fill = prop_virulent)) +
  geom_point(shape = 21, alpha = 0.7) +
  theme_bw() +
  scale_fill_gradient(low = "white", high = "#16697a") +
  labs(title = str_wrap(paste("Virulence alleles",
                              ""), width = 35),
       subtitle = str_wrap(paste("These alleles were identified in a GWAS study",
                                 " to increase either virulence or aggressiveness",
                                 " on some wheat varieties."), width = 65),
       fill = "Proportion of virulent",
       size = "Proportion of virulent",
       x = "Years", y = "") +
  facet_wrap(vars(position))

Well… Whatever…

Avr_Stb6

#On the cluster: run blast of the Avr_Stb6 sequence against each de novo assembly
while read sample ; do sh Detect_gene_blast.sh ./Directories_new.sh /data2/alice/WW_project/7_Virulence/Avr_Stb6.fasta ${sample} Illumina /data2/alice/WW_project/7_Virulence/${sample}_Avr_Stb6.blast.tab; done < list_Third_batch_Dec_2018.txt

#On the cluster: Gather blast results
cat /data2/alice/WW_project/7_Virulence/*Avr_Stb6.blast.tab > /data2/alice/WW_project/7_Virulence/Overall_results_blast_Avr_Stb6.txt

#On the cluster: extract each hit's sequence from the matching assembly.
#Hit coordinates are re-ordered (blast reports them in strand orientation)
#before being handed to bedtools, and the sample name is prepended to every
#fasta header.
while read p ;
 do
sample=$(echo $p | cut -f 1 -d " " ) ;
 chr=$(echo $p | cut -f 4 -d " ") ;
 start=$(echo $p | cut -f 11 -d " ") ; end=$(echo $p | cut -f 12 -d " ") ;
 echo $sample, $chr, $start, $end ;
 order_start=$(echo -e "$start\n$end"  | sort -n | head -n1);
 order_end=$(echo -e "$start\n$end"  | sort -nr | head -n1);
 echo -e "${chr}\t${order_start}\t${order_end}" > temp.bed ;
 ~/Software/bedtools getfasta \
   -fi /data2/alice/WW_project/4_TE_RIP/1_Blast_from_denovo_assemblies/0_Spades/${sample}.fasta\
   -bed temp.bed | \
   sed "s/>/>${sample}:/" >> \
 /data2/alice/WW_project/7_Virulence/Overall_results_blast_Avr_Stb6.fasta ;
done < /data2/alice/WW_project/7_Virulence/Overall_results_blast_Avr_Stb6.txt

#And on the website, because laziness: align the extracted sequences with mafft
mafft --thread 8 --threadtb 5 --threadit 0 --reorder --adjustdirection --auto input > output

Here is the tree for the Avr_Stb6 gene. For reference, the allele carried by ST99CH_1E4 gives an avirulent phenotype, while the allele from ST99CH_1A5 is the virulent allele (from Zhong et al. 2017).

#grep ">" ../7_Virulence/Overall_results_blast_Avr_Stb6.fasta | sed 's/>//' > ../7_Virulence/Overall_results_blast_Avr_Stb6.list

#Parse the fasta headers of the blast hits (ID_file:chr:coords) and attach the
#isolate metadata; contig2 (":" and "." replaced by "_") matches the tip
#labels of the tree below.
temp2 = read_tsv(paste0(virulence_dir, "Overall_results_blast_Avr_Stb6.list"),
                 col_names = "Fasta_header") %>%
  separate(Fasta_header, into = c("ID_file", "chr", "coords"), sep = ":", remove = FALSE) %>%
  mutate(contig2 = stringr::str_replace_all(Fasta_header, "[:.]", "_")) %>%
  left_join(., Zt_meta)

#Read tree in.
#Details from the mafft website about the tree:
#  Size = 640 sequences × 319 sites
#  Method = Neighbor-Joining
#  Model = Jukes-Cantor
#  Bootstrap resampling = 0
#  Alignment id = .200930173318269H5fPcXaMbnDOkoF7E0Y3ilsfnormal

tree_file = "Tree_Avr_Stb6.nwk"

#Raw tree as a tibble, kept untouched for rooting below.
tree2 = as_tibble(treeio::read.tree(paste0(virulence_dir, tree_file)))

#Same tree, with tip labels parsed and joined to the blast metadata (temp2).
tree = tree2 %>%
  mutate(label_copy = label) %>%
  separate(label_copy, into = c("nb", "contig"), extra = "merge", remove = FALSE) %>%
  dplyr::mutate(nb = as.integer(nb),
                label_OG = label,
                contig2 = stringr::str_replace(contig, "R_", "")) %>%
  left_join(., temp2, by = c("contig2")) %>%
  dplyr::mutate(label_temp = ifelse(is.na(ID_file), ifelse(is.na(contig), label, contig), ID_file)) %>%
  unite(col = "label_new", nb, label_temp, sep = "_")

#Tip annotations; the two reference isolates from the Zhong et al. 2017 cross
#(ST99CH_1A5 and ST99CH_1E4) are flagged so they can be highlighted.
temp = tree %>%
  mutate(original_vir = ifelse(ID_file == "ST99CH_1A5", "ST99CH_1A5",
                               ifelse(ID_file == "ST99CH_1E4", "ST99CH_1E4", NA))) %>%
  dplyr::select(label, node, label_new, Continent_Region, Date_Year, Collection, original_vir) %>%
  filter(!is.na(label))

#Find the outgroup (the Z. passerinii sequence, Zp13)!
root_node = tree2 %>%
  filter(str_detect(label, "Zp13")) %>%
  pull(node)


#Root on the outgroup and attach the tip metadata (label column dropped to
#avoid clashing with ggtree's own label).
rooted.tree <- ape::root(as.treedata(tree2), root_node)
p <- ggtree(rooted.tree, layout = "rectangular") %<+% (temp %>% dplyr::select(-label))

#Tips coloured by geographic origin.
p2 <- p +
  geom_tippoint(aes(color = Continent_Region), alpha = 0.7) +
  theme(legend.position = "right") +
  Color_Continent

#p2

#Tips highlighting the two reference isolates.
p3 <- p +
  geom_tippoint(aes(color = original_vir)) +
  theme(legend.position = "right") +
  scale_color_manual(values = c(mycolorsCorrel[1], mycolorsCorrel[20]))
cowplot::plot_grid(p2, p3)

#Tips coloured by sampling year.
p4 <- p +
  geom_tippoint(aes(color = Date_Year), alpha = 0.7) +
  theme(legend.position = "right")
cowplot::plot_grid(p4, p3)